Problem with socket.recv()

xreload xreload at gmail.com
Thu May 17 03:51:15 EDT 2007


Hello !

I have some class for getting html documents :

"""
    Wrapper for Python sockets lib
"""
import socket
import urlparse
import random
import io
import re
import sys

# socket wrapper class
class sock:
    def __init__(self,url):
        parse = urlparse.urlparse(url)

        self.req = []  # request tuple
        self.response = "" # response data
        self.port = socket.getservbyname("www","tcp")  # remote host
port
        if parse[2] is not '':
            if parse[4] is not '':
                self.path = parse[2] + "?" + parse[4]
            else:
                self.path = parse[2]
        else:
            self.path = "/"  # request path

        if parse[1] is not '':
            self.host = parse[1]  # remote host name
        else:
            self.host = ""
        self.req.append("GET " + self.path + " HTTP/1.1")
        self.req.append("Host: " + self.host)
    # set user-agent
    def useragent(self, useragent):
        self.req.append("User-Agent: " + useragent)
    # set document max size in bytes
    def range(self, size=0):
        self.range = size
    # get response document body
    def get_body(self):
        body = self.response.split("\r\n\r\n", 2)
        try:
            return body[1]
        except:
            return self.response
    # do http request
    def request(self, timeout=60,chunk=1024):
        self.req.append("Accept: */*")
        self.req.append("Pragma: no-cache")
        self.req.append("Connection: close")

        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.settimeout(timeout)

        try:
            s.connect((self.host,self.port))
        except:
            print "Cant connect to remote host: "+self.host

        try:
           s.sendall("\r\n".join(self.req)+"\r\n\r\n")
        except:
           print "Cant write data to socket"

        while 1:
            try:
                buffer = s.recv(chunk)
            except:
                print "Cant read data from socket."
                break
            if not buffer :
                break
            self.response = self.response+buffer
            if len(self.response) > self.range and self.range != 0:
                print "Document is too big"
 
break
        try:
            s.close()
        except:
            print "Cant close socket"

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print '\nNo URL specified for module test.\nUsage: sock.py
<URL>'
        sys.exit()
    test = sock(sys.argv[1])
    test.useragent("Mozilla/4.0 (compatible; MSIE 5.5; Windows NT
4.0)")
    test.range()
    test.request()
    print test.get_body()

-----------
So, lets do :
sock.py "http://forums.childrenwithdiabetes.com/showthread.php?t=5030"
- it not ok , only some part of document.
wget "http://forums.childrenwithdiabetes.com/showthread.php?t=5030" -
it ok !
sock.py "http://www.google.com/" - it ok !

Why i got only some part of document ? This is some bug in sockets
module or i do something wrong in my code?

Help me please , iam "googled" several hours , but not found any
related information.

All my bests.Igor.




More information about the Python-list mailing list