[Spambayes] Latest spammer trick stymied - QUESTION
Richard Jowsey
richard at jowsey.com
Sat Apr 5 08:21:12 EST 2003
[Tony]
> You can still donate the source so that it can be converted into a
> nice language ;)
Prototype slurper is attached! Note that the recursive redirect/refresh
logic is a bit quick-and-dirty (Q&D), and needs some more tidying up
(there are only 24 hours in a day, dammit).
[Richard]
> > As for consensus, I'm biased, but I don't really think there
> > is one.
[Tony]
> Hmm. I read all the messages immediately after each other [snip]
> There weren't a lot of people saying that they thought it was a good
> thing to include it, but there were some saying that they were against
> it. Then there were a lot posting queries and answers to that.
> Overall, it did seem very anti the idea.
Perhaps you're right. There does seem to be a dollop of scepticism
about the value of something like this. Funny, coz my beta users
*love* it!
> Perhaps, as a test, someone could convert the code to Python and
> it could be committed (as an option that defaults to False). If test
> results really support it (and no-one can come up with a pure
> tokenisation alternative), it could be left there (still defaulting to
> False, unless all the concerns are addressed).
Fair enough!
Cheers,
Richard
-------------- next part --------------
/*
* @(#) UrlSlurper.java
*/
package net.death2spam.http;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.security.Security;
import javax.net.ssl.SSLSocketFactory;
import net.death2spam.Statics;
import net.death2spam.io.BigByteInputStream;
import net.death2spam.log.HttpLogger;
import com.sun.net.ssl.HttpsURLConnection;
import com.sun.net.ssl.internal.ssl.Provider;
/**
* Gets HTML from a URL for word-frequency analysis.
*/
public final class UrlSlurper {
/* The URL to retrieve */
protected URL url = null;
/* Whether to only obtain the HTTP headers */
protected boolean head = false;
/* Hidden default constructor */
private UrlSlurper() {}
/** Constructor taking a URL argument */
public UrlSlurper(URL url) {
this.url = url;
}
/** Constructor taking URL and head-flag parameters */
public UrlSlurper(URL url, boolean head) {
this.url = url;
this.head = head;
}
/** @return the URL's contents as a String */
public String retrieve() {
String html = "";
Socket socket = null;
try {
HttpLogger.trace("UrlSlurper.retrieve() " + url.toString());
String host = url.getHost();
HttpLogger.trace("UrlSlurper.retrieve() contacting " + host);
int port = url.getPort();
port = (port < 1) ? 80 : port;
if (port == 443) {
Security.addProvider(new Provider());
System.setProperty("java.protocol.handler.pkgs",
"com.sun.net.ssl.internal.www.protocol");
SSLSocketFactory sslFactory = HttpsURLConnection.getDefaultSSLSocketFactory();
socket = sslFactory.createSocket(host, port);
} else
socket = new Socket(host, port);
// create the socket reader and writer
HttpLogger.trace("UrlSlurper.retrieve() " + socket.toString());
BigByteInputStream in = new BigByteInputStream(
socket.getInputStream(), Statics.BUFFER_SIZE);
BufferedOutputStream out = new BufferedOutputStream(
socket.getOutputStream(), Statics.BUFFER_SIZE);
// construct the request headers
String file = url.getFile();
if (file == null || file.length() == 0)
file = "/";
String httpRequest = (head ? "HEAD " : "GET ") + file + " HTTP/1.0" + Statics.CRLF;
httpRequest += "Host: " + host + (port == 80 ? "" : ":" + port) + Statics.CRLF;
httpRequest += "User-Agent: Death2Spam/1.0 (compatible; en)" + Statics.CRLF;
httpRequest += "Content-Type: text/html" + Statics.CRLF;
httpRequest += "Accept: */*" + Statics.CRLF;
httpRequest += "Accept-Language: en" + Statics.CRLF;
httpRequest += "Accept-Charset: iso-8859-1, *, utf-8" + Statics.CRLF;
httpRequest += "Connection: close" + Statics.CRLF;
HttpLogger.trace("UrlSlurper.retrieve() request:" + Statics.CRLF + httpRequest);
httpRequest += Statics.CRLF; // end of RFC-822 header section
// send the request
PrintStream ps = new PrintStream(out, true);
ps.print(httpRequest);
ps.flush();
// read the response
html = in.toString();
HttpLogger.trace("UrlSlurper.retrieve() response:" + Statics.CRLF + html);
}
catch (Exception e) {
if (Statics.debug)
e.printStackTrace(System.err);
HttpLogger.error(e.toString());
}
finally {
if (socket != null) {
try {
socket.close();
} catch (IOException ignore) {}
}
}
return html;
}
/**
* Recursively follows a redirect or refresh trail.
* @param url The page being retrieved.
* @param response Contents of the http response.
* @param headReq whether it's a HEAD request.
* @return The response from any re-directs detected.
* */
public String handleRedirect(URL url, String response, boolean headReq) {
String protocol = url.getProtocol();
String host = url.getHost();
int port = url.getPort();
port = (port > -1) ? port : 80;
String file = url.getFile();
int posCRLF = response.indexOf(Statics.CRLF);
if (posCRLF < 0)
posCRLF = response.indexOf('\n');
String first = response.substring(0, posCRLF).trim();
int posCode = first.indexOf(' ') + 1;
int code = 200;
if (first.indexOf("HTTP") > -1) {
try {
code = Integer.parseInt(first.substring(posCode, posCode + 3));
} catch (NumberFormatException nfe) {
code = 404; // assume a file-not-found error
}
}
try {
if (code > 499) {
if (Statics.debug)
System.err.println(first);
}
else if (code > 399) {
// 404 not found, etc
if (host.startsWith("www."))
return response;
else {
int posDot = host.indexOf('.') + 1;
if (host.indexOf('.', posDot) < 0)
return response;
url = new URL(protocol, "www." + host.substring(posDot).trim(), port, file);
UrlSlurper slurper = new UrlSlurper(url, headReq);
response = slurper.retrieve();
return handleRedirect(url, response, headReq);
}
}
else if (code == 301 || code == 302 || code == 303) {
// moved, found or see-other
String res = response.toLowerCase();
int start = res.indexOf("location:");
if (start < 0) {
int posDot = host.indexOf('.') + 1;
url = new URL(protocol, "www." + host.substring(posDot).trim(), port, file);
}
else if (res.substring(start + 9, start + 15).trim().startsWith("http")) {
start = res.indexOf("http", start + 9);
int end = res.indexOf('\r', start);
url = new URL(response.substring(start, end).trim());
} else {
start += 9;
int end = res.indexOf('\r', start);
file = response.substring(start, end).trim();
url = new URL(protocol, host, port, file);
}
UrlSlurper slurper = new UrlSlurper(url, headReq);
response = slurper.retrieve();
return handleRedirect(url, response, headReq);
}
else if (response.toLowerCase().indexOf("<meta http-equiv=\"refresh\"") > -1) {
// meta refresh tag in header
String res = response.toLowerCase();
int start = res.indexOf("<meta http-equiv=\"refresh\"");
start = res.indexOf("http", start + 27);
int end = res.indexOf('"', start);
url = new URL(response.substring(start, end).trim());
UrlSlurper slurper = new UrlSlurper(url, headReq);
response = slurper.retrieve();
return handleRedirect(url, response, headReq);
}
} catch (MalformedURLException mfe) {}
return response;
}
/** A main method for unit testing */
public static void main(String args[]) {
Statics.debug = true;
try {
URL url = new URL(args[0]);
UrlSlurper slurper = new UrlSlurper(url, false);
String response = slurper.retrieve();
if (response == null || response.length() == 0)
throw new RuntimeException("response is null or empty");
else
response = slurper.handleRedirect(url, response, false);
System.out.println(response);
/*
Map map = new WordFreqMap();
MsgTokenizer mt = new MsgTokenizer();
mt.analyzeContent(response, new ContentType("text/html"), map);
float pSpam = (new Classifier()).chi2SpamProb(map);
System.out.println("pSpam=" + Classifier.prob2String(pSpam));
*/
}
catch (Exception e) {
e.printStackTrace(System.err);
}
}
}
More information about the Spambayes
mailing list