Python爬虫

from requests import request
resp = request('get', 'http://www.baidu.com')
print resp.content

Python爬虫遇到IOError或连接失败等问题时，可以先把请求的headers补充完整（例如User-Agent、Referer等），然后再重试。

爬取贴吧图片

import urllib2
import urllib
import re
import time

def gethtml(url):
    """Download ``url`` and return the raw response body.

    The response is closed as soon as the body has been read, so the
    connection is not left open while further pages are fetched.

    :param url: address of the page to fetch.
    :return: whatever ``response.read()`` yields for the page.
    :raises urllib2.URLError: propagated unchanged when the request fails.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read()
    finally:
        # Close explicitly; try/finally works whether or not the response
        # object supports the ``with`` statement.
        response.close()

def imgget(html, save_pattern=r'E:MyProcodespidersTestimage\%s.jpg'):
    """Download every ``.jpg`` image referenced in ``html``.

    An image is recognised by the attribute sequence ``src="<url>.jpg" size=``.
    Each image is saved under a name built from the current time in
    milliseconds.

    :param html: page source to scan.
    :param save_pattern: ``%``-format string with a single ``%s`` slot that
        receives the millisecond timestamp and yields the destination path.
        NOTE(review): the default looks like a Windows path that has lost its
        directory separators -- confirm the intended folder.
    :return: ``None``; the images are written to disk as a side effect.
    """
    # The dot in ".jpg" is escaped so that only a literal ".jpg" suffix
    # matches; an unescaped dot would accept any character in that position.
    img = re.compile(r'src="(.+?\.jpg)" size=')

    for imgurl in img.findall(html):
        # int() is enough here: Python 2 promotes to long automatically.
        x = int(time.time() * 1000)
        urllib.urlretrieve(imgurl, save_pattern % x)
for i in range(1, 10):
    time.sleep(1)
    print 'start catch page %d' % i
    html = gethtml("https://tieba.baidu.com/p/4844779320?pn=%d" % i)

    imgget(html)

Python爬取网页得到的响应对象是一次性的流：read()一次之后再次read()只会得到空内容，从而导致后续处理出问题。需要多次使用页面内容时，应先把第一次read()的结果保存到变量中，或者去掉多余的read()调用（例如下面被注释掉的调试打印），使响应保持在read之前的状态。

    def getpage(self, pn):
        """Fetch page ``pn`` of the thread and return its raw body.

        The URL is ``self.baseurl + self.lzonly + '&pn=' + str(pn)``.  The
        body is read exactly once and the response is closed afterwards.

        :param pn: page number appended to the URL as the ``pn`` parameter.
        :return: the response body, or ``None`` when a ``urllib2.URLError``
            occurs (its ``reason`` is printed if present).
        """
        url = self.baseurl + self.lzonly + '&pn=' + str(pn)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read()
            finally:
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                # ``reason`` is not always a string, so format it instead of
                # concatenating, which would raise TypeError.
                print(u'连接错误，原因：%s' % (e.reason,))
            return None

爬取贴吧帖子

# -*- coding: utf-8 -*-
__author__ = 'P00113'

import urllib2
import urllib
import re
import time


# 处理页面标签类
class Tool:
    """Convert a fragment of post HTML into plain text.

    The class attributes are pre-compiled patterns; ``replace`` applies them
    in a fixed order and returns the cleaned, stripped text.
    """

    # Remove <img> tags and runs of seven consecutive spaces.
    removeImg = re.compile('<img.*?>| {7}')
    # Remove hyperlink tags (the link text itself is kept).
    removeAddr = re.compile('<a.*?>|</a>')
    # Tags that are turned into a newline ("\n").
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cells <td> are turned into a tab ("\t").
    replaceTD = re.compile('<td>')
    # A paragraph opening is turned into a newline plus a four-space indent.
    replacePara = re.compile('<p.*?>')
    # A single or double <br> is turned into one newline.
    replaceBR = re.compile('<br><br>|<br>')
    # Any tag still left after the steps above is dropped.
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        """Return ``x`` with its HTML markup converted to plain text.

        :param x: HTML fragment to clean.
        :return: the text with tags removed or replaced by whitespace, and
            with leading/trailing whitespace stripped.
        """
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n    ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        # strip() drops the surplus whitespace at both ends.
        return x.strip()


class NZTB(object):
    """Fetch a Baidu Tieba thread page by page and print its posts.

    :param baseurl: URL of the thread, without query string.
    :param lzonly: value sent as the ``see_lz`` query parameter
        (presumably 1 limits the thread to the original poster -- confirm).
    """

    # Patterns are compiled once here instead of on every method call.
    _TITLE_PAT = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
    _REPLY_NUM_PAT = re.compile(
        '<li class="l_reply_num.*?<span.*?>(.*?)</span>', re.S)
    # Body of each post.
    _CONTENT_PAT = re.compile('<div id="post_content_.*?">(.*?)</div>', re.S)
    # Floor number of each post; "\d" needs its backslash to match digits.
    _FLOOR_PAT = re.compile(
        r'<div class="post-tail-wrap"><span.*?>(\d*)楼', re.S)

    def __init__(self, baseurl, lzonly):
        self.baseurl = baseurl
        self.lzonly = '?see_lz=' + str(lzonly)
        self.tool = Tool()

    def getpage(self, pn):
        """Fetch page ``pn`` of the thread and return its raw body.

        :param pn: page number appended to the URL as the ``pn`` parameter.
        :return: the response body, or ``None`` when a ``urllib2.URLError``
            occurs (its ``reason`` is printed if present).
        """
        url = self.baseurl + self.lzonly + '&pn=' + str(pn)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read()
            finally:
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                # ``reason`` is not always a string, so format it instead of
                # concatenating, which would raise TypeError.
                print(u'连接错误，原因：%s' % (e.reason,))
            return None

    def gettitle(self):
        """Print and return the thread title taken from page 1.

        :return: the stripped title, or ``None`` when the page could not be
            fetched or contains no title heading (a dashed line is printed).
        """
        html = self.getpage(1)
        res = self._TITLE_PAT.search(html) if html is not None else None
        if res is None:
            print('-' * 100)
            return None
        print(res.group(1))
        return res.group(1).strip()

    def getpn(self):
        """Print and return the first ``<span>`` text of the reply counter.

        :return: the captured text from the ``l_reply_num`` list item on
            page 1, or ``None`` when the page could not be fetched or the
            item is absent (``*****`` is printed).
        """
        html = self.getpage(1)
        res = self._REPLY_NUM_PAT.search(html) if html is not None else None
        if res is None:
            print('*****')
            return None
        print(res.group(1))
        return res.group(1)

    def getcontent(self, pn):
        """Print every non-empty post on page ``pn`` with its floor number.

        Nothing is printed when the page could not be fetched.

        :param pn: page number to fetch.
        :return: ``None``.
        """
        html = self.getpage(pn)
        if html is None:
            return
        bodies = self._CONTENT_PAT.findall(html)
        floors = self._FLOOR_PAT.findall(html)
        for val, f in zip(bodies, floors):
            v = self.tool.replace(val)
            if not v:
                # Posts that are empty after tag removal are skipped.
                continue
            print(u'%s 楼%s' % (f, '-' * 100))
            # Post text followed by one blank line.
            print(v + '\n')


if __name__ == '__main__':
    # Print the posts of the first three pages of the thread, with the
    # see_lz parameter set to 0.
    spider = NZTB('http://tieba.baidu.com/p/5058456989', 0)
    for page_no in (1, 2, 3):
        spider.getcontent(page_no)

Python连接数据库时出现 UnicodeEncodeError: 'latin-1' codec can't encode character

解决办法：在建立连接之后，按如下方式加入几行设置字符集为utf8的代码。

import MySQLdb


# Connection settings for the database.
# NOTE(review): host and password are hard-coded here; consider loading them
# from configuration or the environment instead of keeping them in source.
db_para = dict(
    host='10.10.12.171',
    port=3306,
    user='root',
    passwd='Hwroot@com',
    db='test',
)
dbcon = MySQLdb.connect(**db_para)
cur = dbcon.cursor()

# Switch the connection and the session to utf8; these are the added lines
# that address the 'latin-1' UnicodeEncodeError.
dbcon.set_character_set('utf8')
for charset_sql in ('SET NAMES utf8;',
                    'SET CHARACTER SET utf8;',
                    'SET character_set_connection=utf8;'):
    cur.execute(charset_sql)