urllib源码简单分析

对下面这段代码做分析

import urllib
params = urllib.urlencode({'wd': 'python'})
f = urllib.urlopen("http://www.baidu.com/s?%s" % params)
print f.read()

这是一段简单读取url内容的代码

此处最关键的是urlopen，通过查看，可以看到urlopen的代码如下

def urlopen(url, data=None, proxies=None):
    """Create a file-like object for the specified URL to read from."""
    from warnings import warnpy3k
    warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
             "favor of urllib2.urlopen()", stacklevel=2)

    global _urlopener
    if proxies is not None:
        opener = FancyURLopener(proxies=proxies)
    elif not _urlopener:
        opener = FancyURLopener()
        _urlopener = opener
    else:
        opener = _urlopener
    if data is None:
        return opener.open(url)
    else:
        return opener.open(url, data)

通过一个FancyURLopener的opener实例，因为这里没有proxies参数，所以调用到opener = FancyURLopener()这一句。

然后返回opener.open(url)，绑定到f实例上。在这里，有两个关键，一个是opener实例，一个是open方法。

先来说说opener，opener是FancyURLopener()的对象,而FancyURLopener的父类是URLopener基类，而FancyURLopener这个类本身只做了一些http的异常响应处理，因此我们需要了解核心的基类，也就是看看URLopener到底做了什么？

URLopener:通过查看源码，发现URLopener的主要处理方法是open。

    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        # percent encode url, fixing lame server errors for e.g, like space
        # within url paths.
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

open通过处理url，将name拼接成name = 'open_' + urltype的格式，也就是说，如果是http请求，则name为open_http。在上文那段代码里，最后调用返回的是getattr(self, name)(url)，而由于name变成了open_http,则继续调用open_http方法。在这里可以看出，urllib根据你的type来给出不同的方法作处理。

那么又要看看open_http干了什么.

通过debug发现，open_http其实是用httplib来做底层处理的

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-Type', 'application/x-www-form-urlencoded')
            h.putheader('Content-Length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: h.putheader(*args)
        h.endheaders(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()

红色加粗为核心处理部分。可以看出，这是通过切割host和请求参数后来对服务器发起请求并处理response的过程。

到目前为止,我们可以发现,真正向服务器发起请求的是这一句:h.putrequest('GET', selector).

那么继续追踪定位,

hdr = '%s %s %s' % (method, url, self._http_vsn_str),实际hdr为:'GET /s?wd=python HTTP/1.0',然后输出self._output(hdr),而这个_output的作用是向当前请求缓冲区添加一行输出。然后通过以下方法返回buffer中的内容放置在一个fp的对象里。

        try:
            if not buffering:
                response = self._conn.getresponse()
            else:
                #only add this keyword if non-default for compatibility
                #with other connection classes
                response = self._conn.getresponse(buffering)
        except BadStatusLine, e:
            ### hmm. if getresponse() ever closes the socket on a bad request,
            ### then we are going to have problems with self.sock

            ### should we keep this behavior? do people use it?
            # keep the socket open (as a file), and return it
            self.file = self._conn.sock.makefile('rb', 0)

            # close our socket -- we want to restart after any protocol error
            self.close()

            self.headers = None
            return -1, e.line, None

最后通过一个迭代器不断读回文件内容。

class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"):
            self.fileno = self.fp.fileno
        else:
            self.fileno = lambda: None
        if hasattr(self.fp, "__iter__"):
            self.__iter__ = self.fp.__iter__
            if hasattr(self.fp, "next"):
                self.next = self.fp.next

...