[Spambayes] Latest spammer trick stymied - QUESTION

Sat Apr 5 08:21:12 EST 2003

[Tony]
> You can still donate the source so that it can be converted into a
> nice language ;)

Prototype slurper is attached! Note that the recursive
redirect/refresh logic is a bit quick-and-dirty (Q&D), and needs some more
tidying up (there are only 24 hours in a day, dammit).

[Richard]
> > As for consensus, I'm biased, but I don't really think there 
> > is one.
[Tony]
> Hmm. I read all the messages immediately after each other [snip]
> There weren't a lot of people saying that they thought it was a good
> thing to include it, but there were some saying that they were against 
> it. Then there were a lot posting queries and answers to that.  
> Overall, it did seem very anti the idea.

Perhaps you're right. There does seem to be a dollop of scepticism 
about the value of something like this. Funny, coz my beta users 
*love* it!

> Perhaps, as a test, someone could convert the code to Python and
> it could be committed (as an option that defaults to False).  If test
> results really support it (and no-one can come up with a pure
> tokenisation alternative), it could be left there (still defaulting to
> False, unless all the concerns are addressed).

Fair enough!

Cheers,
Richard

-------------- next part --------------
/*
 * @(#) UrlSlurper.java
 */

package net.death2spam.http;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.security.Security;

import javax.net.ssl.SSLSocketFactory;

import net.death2spam.Statics;
import net.death2spam.io.BigByteInputStream;
import net.death2spam.log.HttpLogger;

import com.sun.net.ssl.HttpsURLConnection;
import com.sun.net.ssl.internal.ssl.Provider;

/**
 * Gets HTML from a URL for word-frequency analysis.
 */
public final class UrlSlurper {

   /* The URL to retrieve */
   protected URL url = null;

   /* Whether to only obtain the HTTP headers */
   protected boolean head = false;

   /* Hidden default constructor */
   private UrlSlurper() {}

   /** Constructor taking a URL argument */
   public UrlSlurper(URL url) {
      this.url = url;
   }

   /** Constructor taking URL and head-flag parameters */
   public UrlSlurper(URL url, boolean head) {
      this.url = url;
      this.head = head;
   }

   /** @return the URL's contents as a String */
   public String retrieve() {
      String html = "";
      Socket socket = null;
      try {
         HttpLogger.trace("UrlSlurper.retrieve() " + url.toString());
         String host = url.getHost();
         HttpLogger.trace("UrlSlurper.retrieve() contacting " + host);

         int port = url.getPort();
           port = (port < 1) ? 80 : port;
         if (port == 443) {
            Security.addProvider(new Provider());
            System.setProperty("java.protocol.handler.pkgs",
               "com.sun.net.ssl.internal.www.protocol");
            SSLSocketFactory sslFactory = HttpsURLConnection.getDefaultSSLSocketFactory();
            socket = sslFactory.createSocket(host, port);
         } else
            socket = new Socket(host, port);

         // create the socket reader and writer
         HttpLogger.trace("UrlSlurper.retrieve() " + socket.toString());
         BigByteInputStream in = new BigByteInputStream(
            socket.getInputStream(), Statics.BUFFER_SIZE);
         BufferedOutputStream out = new BufferedOutputStream(
            socket.getOutputStream(), Statics.BUFFER_SIZE);

         // construct the request headers
         String file = url.getFile();
         if (file == null || file.length() == 0)
            file = "/";
         String httpRequest = (head ? "HEAD " : "GET ") + file + " HTTP/1.0" + Statics.CRLF;
         httpRequest += "Host: " + host + (port == 80 ? "" : ":" + port) + Statics.CRLF;
         httpRequest += "User-Agent: Death2Spam/1.0 (compatible; en)" + Statics.CRLF;
         httpRequest += "Content-Type: text/html" + Statics.CRLF;
         httpRequest += "Accept: */*" + Statics.CRLF;
         httpRequest += "Accept-Language: en" + Statics.CRLF;
         httpRequest += "Accept-Charset: iso-8859-1, *, utf-8" + Statics.CRLF;
         httpRequest += "Connection: close" + Statics.CRLF;
         HttpLogger.trace("UrlSlurper.retrieve() request:" + Statics.CRLF + httpRequest);
         httpRequest += Statics.CRLF;  // end of RFC-822 header section

         // send the request
         PrintStream ps = new PrintStream(out, true);
         ps.print(httpRequest);
         ps.flush();

         // read the response
         html = in.toString();
         HttpLogger.trace("UrlSlurper.retrieve() response:" + Statics.CRLF + html);
      }
      catch (Exception e) {
         if (Statics.debug)
            e.printStackTrace(System.err);
         HttpLogger.error(e.toString());
      }
      finally {
         if (socket != null) {
            try {
               socket.close();
            } catch (IOException ignore) {}
         }
      }
      return html;
   }

   /**
    * Recursively follows a redirect or refresh trail.
    * @param  url  The page being retrieved.
    * @param  response  Contents of the http response.
    * @param  headReq  whether it's a HEAD request.
    * @return  The response from any re-directs detected.
    * */
   public String handleRedirect(URL url, String response, boolean headReq) {
      String protocol = url.getProtocol();
      String host = url.getHost();
      int port = url.getPort();
      port = (port > -1) ? port : 80;
      String file = url.getFile();

      int posCRLF = response.indexOf(Statics.CRLF);
      if (posCRLF < 0)
         posCRLF = response.indexOf('\n');
      String first = response.substring(0, posCRLF).trim();
      int posCode = first.indexOf(' ') + 1;
      int code = 200;
      if (first.indexOf("HTTP") > -1) {
         try {
            code = Integer.parseInt(first.substring(posCode, posCode + 3));
         } catch (NumberFormatException nfe) {
            code = 404;  // assume a file-not-found error
         }
      }

      try {
         if (code > 499) {
            if (Statics.debug)
               System.err.println(first);
         }
         else if (code > 399) {
            // 404 not found, etc
            if (host.startsWith("www."))
               return response;
            else {
               int posDot = host.indexOf('.') + 1;
               if (host.indexOf('.', posDot) < 0)
                  return response;
               url = new URL(protocol, "www." + host.substring(posDot).trim(), port, file);
               UrlSlurper slurper = new UrlSlurper(url, headReq);
               response = slurper.retrieve();
               return handleRedirect(url, response, headReq);
            }
         }
         else if (code == 301 || code == 302 || code == 303) {
            // moved, found or see-other
            String res = response.toLowerCase();
            int start = res.indexOf("location:");
            if (start < 0) {
               int posDot = host.indexOf('.') + 1;
               url = new URL(protocol, "www." + host.substring(posDot).trim(), port, file);
            }
            else if (res.substring(start + 9, start + 15).trim().startsWith("http")) {
               start = res.indexOf("http", start + 9);
               int end = res.indexOf('\r', start);
               url = new URL(response.substring(start, end).trim());
            } else {
               start += 9;
               int end = res.indexOf('\r', start);
               file = response.substring(start, end).trim();
               url = new URL(protocol, host, port, file);
            }
            UrlSlurper slurper = new UrlSlurper(url, headReq);
            response = slurper.retrieve();
            return handleRedirect(url, response, headReq);
         }
         else if (response.toLowerCase().indexOf("<meta http-equiv=\"refresh\"") > -1) {
            // meta refresh tag in header
            String res = response.toLowerCase();
            int start = res.indexOf("<meta http-equiv=\"refresh\"");
            start = res.indexOf("http", start + 27);
            int end = res.indexOf('"', start);
            url = new URL(response.substring(start, end).trim());
            UrlSlurper slurper = new UrlSlurper(url, headReq);
            response = slurper.retrieve();
            return handleRedirect(url, response, headReq);
         }
      } catch (MalformedURLException mfe) {}
      return response;
   }

   /** A main method for unit testing */
   public static void main(String args[]) {
      Statics.debug = true;
      try {
         URL url = new URL(args[0]);
         UrlSlurper slurper = new UrlSlurper(url, false);
         String response = slurper.retrieve();
         if (response == null || response.length() == 0)
            throw new RuntimeException("response is null or empty");
         else
            response = slurper.handleRedirect(url, response, false);
         System.out.println(response);

         /*
         Map map = new WordFreqMap();
         MsgTokenizer mt = new MsgTokenizer();
         mt.analyzeContent(response, new ContentType("text/html"), map);
         float pSpam = (new Classifier()).chi2SpamProb(map);
         System.out.println("pSpam=" + Classifier.prob2String(pSpam));
         */
      }
      catch (Exception e) {
         e.printStackTrace(System.err);
      }
   }
}