Filtering comp.lang.python

Wanderer wanderer at dialup4less.com
Tue Apr 10 11:28:58 EDT 2018


On Tuesday, April 10, 2018 at 3:28:05 AM UTC-4, Thomas Jollans wrote:
> On 2018-04-10 07:06, T Berger wrote:
> > This is the first time I've joined a google group and I don't understand the setup. Why are most of the posts in this group unrelated to python, and how do I filter this junk (sorry) out?
> > 
> 
> Welcome to python-list/comp.lang.python!
> 
> This isn't originally a Google group. Google just mirrors the old USENET
> group, which is awash with spam.
> 
> There is also a mailing list version of this group (posts are mirrored
> both ways) at https://mail.python.org/mailman/listinfo/python-list
> 
> The mailing list has proper spam filtering and some moderation. None (or
> barely any) of the regulars use Google Groups. Some people use USENET
> directly and maintain their own extensive filtering regime to make it
> readable. Probably most of us use the mailing list, because it's just so
> much nicer!
> 
> -- Thomas

Here's my Python code for filtering Google Groups again. To run it from a Firefox bookmark, compile it to a .pyc file and bookmark that. You also need to create the bannedAuthors.txt and bannedSubjects.txt files.
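The banned-list files are plain text, one entry per line (the script just splits the file contents on newlines). For example, bannedAuthors.txt might look like this (the entries here are made up):

Some Spammer
ANOTHER SPAMMER

bannedSubjects.txt works the same way, one banned subject string per line. Since the script lowercases the subject and strips its spaces before matching, banned subject entries should be written in lowercase with no spaces, and only entries longer than 3 characters are checked.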



# remove posts whose author is on the banned list or mostly caps,
# and posts whose subject is on the banned subjects list

# to compile to pyc
#>>>import py_compile
#>>>py_compile.compile("file.py")


import urllib2
import webbrowser
import os
from bs4 import BeautifulSoup
import argparse

class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg


PALEMOON = 'Mozilla/5.0 (Windows NT 6.1; WOW64) KHTML/4.11 Gecko/20130308 Firefox/33.0 (PaleMoon/25.2)'
WATERFOX = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:40.0) Gecko/20100101 Firefox/51.1.0 Waterfox/51.1.0'
USERAGENTBASE = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:40.0) Gecko/20100101 '
BROWSERPATH = 'C:\\"Program Files"\\Waterfox\\waterfox.exe'
FILENAME = 'C:\\Pystuff\\pygroup.htm'
SEDFILENAME = 'C:\\Pystuff\\SED.htm'
WEBPAGE_START = "https://groups.google.com/forum/?_escaped_fragment_=forum/"
PYGROUP_WEBPAGE = "comp.lang.python%5B"
SED_WEBPAGE = "sci.electronics.design%5B"
WEBPAGE_END = "%5D"
BANNED_AUTHORS_FILE = 'C:\\Pystuff\\bannedAuthors.txt'
BANNED_SUBJECTS_FILE = 'C:\\Pystuff\\bannedSubjects.txt'

def getUserAgentVersion():
    """ get the useragent version
        returns agentVersion
        -- user agent version in format Firefox/51.0.1 Waterfox/51.0.1
    """
    bvers = os.popen(BROWSERPATH + " -v").read()
    bversList = bvers.split()
    agentVersion = 'Firefox/' + bversList[2] + ' ' + bversList[1] + '/' + bversList[2]
    return agentVersion    

def getwebpage(url):
    """ Open a webpage 
    url --  the url to the webpage
    returns 
    page -- the source for the webpage
    """
    user_agent = USERAGENTBASE + getUserAgentVersion()
    headers = { 'User-Agent' : user_agent }
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    page = response.read()
    return page

def getBannedAuthors():
    """ Convert the banned authors text file into a list
    returns
    bannedAuthors -- list of banned author strings
    """
    f = open(BANNED_AUTHORS_FILE, 'r')
    bannedAuthors = f.read().split('\n')
    f.close()
    return bannedAuthors

def getBannedSubjects():
    """ Convert the banned subjects text file into a list
    returns
    bannedSubjects -- list of banned subject strings
    """
    f = open(BANNED_SUBJECTS_FILE, 'r')
    bannedSubjects = f.read().split('\n')
    f.close()
    return bannedSubjects

def removeBadAuthors(html_doc, filecode):
    """ Remove posts from the google group page whose author is on the banned
    list or mostly caps, or whose subject is on the banned subjects list
    html_doc -- an html document
    filecode -- mode for opening the output file ('w' or 'a')
    returns
    postcount -- number of posts kept
    """
    bannedAuthors = getBannedAuthors()
    bannedSubjects = getBannedSubjects()
    #print bannedAuthors
    
    soup = BeautifulSoup(html_doc, "html.parser")
    #print soup.prettify()
    post = soup.find("tr")
    postcount = 0
    banNoneCount = 0
    banNameCount = 0
    banBigCount = 0
    banSubjectCount = 0
    while post is not None:
        postcount += 1
        author = post.find("td", "author")
        subject = post.find("td", "subject")        
        if author is None or subject is None:
            print "Author is None"
            oldpost = post
            post = oldpost.find_next_sibling('tr')
            oldpost.decompose()
            postcount = postcount - 1
            banNoneCount += 1
        else:
            aname = author.get_text()
            print aname.encode("ascii", "ignore")
            asubject = ((subject.get_text()).lower()).replace(" ", "")
            bannedsubject = False
            for badsubject in bannedSubjects:
                print "BAD SUBJECT", badsubject
                if badsubject in asubject and len(badsubject) > 3:
                    print "ASUBJECT", asubject.encode("ascii", "ignore")
                    bannedsubject = True
                    break
            if bannedsubject:
                print "Subject is Banned"
                oldpost = post
                post = oldpost.find_next_sibling('tr')
                oldpost.decompose()
                postcount = postcount - 1
                banSubjectCount += 1   
            elif aname in bannedAuthors or \
                 'smtb' in aname:
                print "Author is Banned"
                oldpost = post
                post = oldpost.find_next_sibling('tr')
                oldpost.decompose()
                postcount = postcount - 1
                banNameCount += 1
            else:
                print author
                numCaps = 1.0 * sum(1 for c in aname if c.isupper())
                ratio = numCaps/(1.0*len(aname))
                print ratio
                oldpost = post
                post = oldpost.find_next_sibling('tr')
                if ratio > 0.7 or len(aname) > 35:
                    oldpost.decompose()
                    postcount = postcount - 1
                    banBigCount += 1
                    print "BIG"
        if post is None: print "Post is NONE"
    f = open(FILENAME, filecode)
    f.write(soup.prettify().encode('ascii', 'ignore') + '<br>\n\r')
    f.write('<a> Banned No Name: ' + str(banNoneCount) + '</a>, ')
    f.write('<a> Banned Name: ' + str(banNameCount) + '</a>, ')
    f.write('<a> All Uppercase Name: ' + str(banBigCount) + '</a>, ')
    f.write('<a> Banned Subject: ' + str(banSubjectCount) + '</a>, ')
    f.write('<a> Total Banned: ' + str(banNoneCount +banNameCount + banBigCount + banSubjectCount) + '</a><br>\n\r')
    f.close()
    return postcount
    
         
def main(sed = None):
    if sed is None:
        parser = argparse.ArgumentParser()
        parser.add_argument('-s', '--sed', help="load sci.electronics.design group", action="store_true")
        args = parser.parse_args()
        sed = args.sed
    webgroup = SED_WEBPAGE if sed else PYGROUP_WEBPAGE
            
    
    postcount = 0
    numberOposts = 0
    filecode = 'w'
    while postcount < 10:
        webpage = WEBPAGE_START + webgroup + str(numberOposts + 1) + '-' + str(numberOposts + 50) + WEBPAGE_END 
        print webpage
        html_doc = getwebpage(webpage)
        postcount += removeBadAuthors(html_doc, filecode)
        if postcount < 10:
            numberOposts += 50
            filecode = 'a'
            print "postcount less than 10", postcount
            print "number of posts", numberOposts
        
    webbrowser.open(FILENAME)
        
    print 'done'
    
if __name__ == "__main__":
    main()    
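This is Python 2 code (urllib2 and print statements), so run it with a Python 2 interpreter. From the command line it works like this (file.py is just a placeholder for whatever name you saved the script under):

python file.py        # filter comp.lang.python
python file.py -s     # filter sci.electronics.design

Without -s/--sed the script filters comp.lang.python; it keeps fetching pages of 50 posts until at least 10 posts survive the filters, then opens the result in the browser.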
    


