FYI: Removing posts with All Cap Authors
Wanderer
wanderer at dialup4less.com
Sat Mar 4 11:22:34 EST 2017
I mostly just lurk and view the post titles to see if something interesting is being discussed. This code gets me a web page without the spam. You need to compile it to a pyc file and create a bookmark. Probably not useful for most people who don't use their browsers the way I do, but here it is.
# remove authors with mostly caps
import urllib2
import webbrowser
import os
from bs4 import BeautifulSoup
# Fixed leading part of the User-Agent header; getwebpage() appends the
# result of getUserAgentVersion() to it (note the trailing space).
USERAGENTBASE = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:40.0) Gecko/20100101 '
# Command used to start the browser; getUserAgentVersion() runs it with
# " -v" through os.popen.  The embedded double quotes are presumably there
# so the shell accepts the space in "Program Files" -- confirm before changing.
BROWSERPATH = 'C:\\"Program Files"\\Waterfox\\waterfox.exe'
# Local file that removeAllCaps() writes the filtered page to and that
# main() then opens with webbrowser.
FILENAME = 'C:\\PyStuff\\pygroup.htm'
# URL that main() downloads and filters.
WEBPAGE = "https://groups.google.com/forum/?_escaped_fragment_=forum/comp.lang.python"
def getUserAgentVersion():
    """ get the useragent version

    Runs the browser configured in BROWSERPATH with " -v" and splits its
    output on whitespace.  The second token is used as the product name
    and the third as the version number (presumably the browser prints
    something like "Mozilla Waterfox 51.0.1" -- verify for other browsers).

    returns
    agentVersion -- user agent version in format Firefox/51.0.1 Waterfox/51.0.1

    raises
    IndexError -- if the browser printed fewer than three tokens
    """
    pipe = os.popen(BROWSERPATH + " -v")
    try:
        bvers = pipe.read()
    finally:
        # Always release the pipe, even if reading fails.
        pipe.close()

    bversList = bvers.split()
    if len(bversList) < 3:
        # Same exception type the bare indexing used to raise, but with
        # enough context to see what the browser actually printed.
        raise IndexError("unexpected output from %s -v: %r"
                         % (BROWSERPATH, bvers))

    product = bversList[1]
    version = bversList[2]
    agentVersion = 'Firefox/' + version + ' ' + product + '/' + version
    return agentVersion
def getwebpage(url):
    """ Open a webpage

    The request is sent with a User-Agent header built from USERAGENTBASE
    plus the value returned by getUserAgentVersion().

    url -- the url to the webpage

    returns
    page -- the source for the webpage

    Errors raised by urllib2.urlopen are not caught here and propagate to
    the caller.
    """
    user_agent = USERAGENTBASE + getUserAgentVersion()
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    try:
        page = response.read()
    finally:
        # The urllib2 response is not a context manager in Python 2, so
        # close it explicitly to release the connection.
        response.close()
    return page
def _caps_ratio(name):
    """ Return the fraction of characters in name that are upper case (0.0 if empty) """
    if not name:
        # Avoid dividing by zero for an empty author cell.
        return 0.0
    numCaps = sum(1 for c in name if c.isupper())
    return numCaps / float(len(name))


def removeAllCaps(html_doc, threshold=0.7, filename=None):
    """ Remove posts from a google group page whose author name is mostly caps

    Starting at the first <tr> in the document, every following sibling
    <tr> is inspected.  A row is deleted when the share of upper-case
    characters in the text of its <td class="author"> cell is greater than
    threshold.  Rows without such a cell are left untouched.  The filtered
    document is then written to filename, with non-ASCII characters dropped.

    html_doc -- an html document
    threshold -- upper-case ratio above which a post is removed (default 0.7)
    filename -- file to write the result to (default: FILENAME)
    """
    if filename is None:
        filename = FILENAME

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html_doc, "html.parser")

    # Collect the rows up front: removing a row while walking the sibling
    # chain would otherwise require remembering the next row first.
    rows = []
    first = soup.find("tr")
    if first is not None:
        rows.append(first)
        rows.extend(first.find_next_siblings('tr'))

    for post in rows:
        author = post.find("td", "author")
        if author is None:
            # Not a post row (no author cell); nothing to judge.
            continue
        print(author)
        ratio = _caps_ratio(author.get_text())
        print(ratio)
        if ratio > threshold:
            post.decompose()
            print("BIG")

    # 'with' guarantees the file is closed even if encoding or writing fails.
    with open(filename, 'w') as f:
        f.write(soup.prettify().encode('ascii', 'ignore'))
def main():
    """ Download WEBPAGE, write the filtered copy to FILENAME and open it in the browser """
    removeAllCaps(getwebpage(WEBPAGE))
    webbrowser.open(FILENAME)
    print('done')


if __name__ == "__main__":
    main()
More information about the Python-list
mailing list