Script to ban authors from Google Groups

Wanderer wanderer at dialup4less.com
Mon Jul 3 09:30:05 EDT 2017


I use this script to ban authors from Google Groups. You need to create a banned-authors text file with one author name per line. For Mozilla-based browsers, you need to compile the script to a .pyc file, associate .pyc files with Python, and create a bookmark to the script. You then use the bookmark to open the filtered Google Groups page.
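
For reference, the banned-authors file is just plain text, one name per line. A hypothetical bannedAuthors.txt (made-up names) might look like:

Some Troll
ANOTHER SPAMMER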

# Remove posts by banned authors and by authors whose names are mostly caps

# to compile to pyc
#>>>import py_compile
#>>>py_compile.compile("file.py")


import urllib2
import webbrowser
import os
from bs4 import BeautifulSoup

PALEMOON = 'Mozilla/5.0 (Windows NT 6.1; WOW64) KHTML/4.11 Gecko/20130308 Firefox/33.0 (PaleMoon/25.2)'
WATERFOX = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:40.0) Gecko/20100101 Firefox/51.1.0 Waterfox/51.1.0'
USERAGENTBASE = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:40.0) Gecko/20100101 '
BROWSERPATH = 'C:\\"Program Files"\\Waterfox\\waterfox.exe'
FILENAME = 'C:\\PyStuff\\pygroup.htm'
WEBPAGE = "https://groups.google.com/forum/?_escaped_fragment_=forum/comp.lang.python%5B1-50%5D"
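# the "_escaped_fragment_" form of the URL asks for a static HTML
# snapshot of the group (Google's old AJAX-crawling scheme), which is
# what BeautifulSoup parses below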
BANNED_AUTHORS_FILE = 'C:\\PyStuff\\bannedAuthors.txt'

def getUserAgentVersion():
    """ get the useragent version
        returns agentVersion
        -- user agent version in format Firefox/51.0.1 Waterfox/51.0.1
    """
    bvers = os.popen(BROWSERPATH + " -v").read()
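    # "-v" is assumed to print a banner like "Mozilla Waterfox 51.1.0"
    # (matching Firefox's "-v" output), so split() yields
    # ['Mozilla', 'Waterfox', '51.1.0']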
    bversList = bvers.split()
    agentVersion = 'Firefox/' + bversList[2] + ' ' + bversList[1] + '/' + bversList[2]
    return agentVersion    

def getwebpage(url):
    """ Open a webpage 
    url --  the url to the webpage
    returns 
    page -- the source for the webpage
    """
    user_agent = USERAGENTBASE + getUserAgentVersion()
    headers = { 'User-Agent' : user_agent }
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    page = response.read()
    return page

def getBannedAuthors():
    """ Convert the banned authors text file into a list
    returns
    bannedAuthors -- list of banned author strings
    """
    with open(BANNED_AUTHORS_FILE, 'r') as f:
        # strip each line and drop blanks so a trailing newline does not
        # put an empty entry in the list
        bannedAuthors = [line.strip() for line in f if line.strip()]
    return bannedAuthors

def removeBadAuthors(html_doc):
    """ Remove posts from google group by authors that are mostly caps or on the Banned List
    html_doc -- an html document
    """
    bannedAuthors = getBannedAuthors()
    print bannedAuthors

    soup = BeautifulSoup(html_doc, 'html.parser')
    #print soup.prettify()
    post = soup.find("tr")
    while post is not None:
        author = post.find("td", "author")
        if author is None:
            # check for a missing author cell before calling get_text()
            print "Author is None"
            oldpost = post
            post = oldpost.find_next_sibling('tr')
            oldpost.decompose()
        else:
            aname = author.get_text().strip()
            if aname in bannedAuthors:
                print "Author is Banned"
                oldpost = post
                post = oldpost.find_next_sibling('tr')
                oldpost.decompose()
            else:
                print author
                numCaps = sum(1 for c in aname if c.isupper())
                # guard against an empty name to avoid dividing by zero
                ratio = numCaps / (1.0 * len(aname)) if aname else 0.0
                print ratio
                oldpost = post
                post = oldpost.find_next_sibling('tr')
                if ratio > 0.7:
                    oldpost.decompose()
                    print "BIG"
        if post is None: print "Post is NONE"
    with open(FILENAME, 'w') as f:
        f.write(soup.prettify().encode('ascii', 'ignore'))
    
         
def main():
    html_doc = getwebpage(WEBPAGE)
    removeBadAuthors(html_doc)
    webbrowser.open(FILENAME)
    print 'done'
    
if __name__ == "__main__":
    main()    
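
In case the 0.7 threshold looks arbitrary: the filter just computes the fraction of uppercase characters in the author name. A quick interactive check (not part of the script, made-up name):

>>> aname = 'LOUD PERSON'
>>> sum(1 for c in aname if c.isupper()) / (1.0 * len(aname))
0.9090909090909091

That post would be dropped, while a name like 'Wanderer' scores 0.125 and is kept.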


