用Python实现的一个简单的爬取省市乡镇的行政区划信息的脚本

# coding=utf-8
# Creeper
import os
import bs4
import time
import MySQLdb
import urllib2
import datetime
import warnings
import traceback
import ConfigParser

try:
    basedir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    import sys
    basedir = os.path.dirname(os.path.abspath(sys.argv[0]))

SETTINGS_FILE = os.path.join(basedir, 'settings.ini')
GLOBAL_CONFIG = {
    'server': {
        'debug': False,
    },
    'db': {
        'host': '127.0.0.1',
        'port': '3306',
        'user': 'root',
        'password': '',
        'dbname': 'test',
        'table': 'group'
    }
}

def __config(item):
    GLOBAL_CONFIG[sec][item[0]] = item[1]
try:
    parser = ConfigParser.ConfigParser()
    parser.readfp(open(SETTINGS_FILE))
    for sec in parser.sections():
        map(__config, parser.items(sec))
except:
    print 'settings.ini needed'
    raise

N = 0

class Handle(object):

    def __init__(self, *args, **kwargs):
        super(Handle, self).__init__()
        self.db = {}
        self.conn = None
        self.cursor = None
        self._cursor(**kwargs)

    def _cursor(self, **kwargs):
        self.db.update(**kwargs)
        host = kwargs.get('host', '127.0.0.1')
        port = int(kwargs.get('port', 3306))
        user = kwargs.get('user', 'root')
        pwd = kwargs.get('password', '')
        dbname = kwargs.get('dbname', 'test')
        charset = kwargs.get('charset', 'utf8')
        _conn = MySQLdb.connect(user=user, passwd=pwd,
                host=host, port=port, charset=charset)
        try:
            _conn.select_db(dbname)
        except:
            sql = """CREATE DATABASE IF NOT EXISTS `%s` DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
                  """ % dbname
            _conn.cursor().execute(sql)
            _conn.select_db(dbname)
        self.conn = _conn
        self.cursor = _conn.cursor()

    def format_pk(self, pk, index):
        d = {1:2, 2:4, 3:6, 4:9, 5:12}
        try:
            pk = int(float(pk))
        except:
            raise ValueError, 'the primary key must be integer or string interger'
        return int(str(pk)[:d[index]])
        while (pk * 1.0 / 10).is_integer():
            pk = pk * 1.0 / 10
        return pk < 10 and int(pk) * 10 or int(pk)

    def do_execute(self, pk, name, type, parent='NULL'):
        global N
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')
        if N%100 == 0 and N != 0:
            print 'Total: %s ---%s' % (N, now)
            # time.sleep(5)
        if not parent or parent == '':
            parent = 'NULL'
        sql = """INSERT INTO `%s` (`id`, `name`, `type`, `parent`) VALUES(%s, '%s', '%s', %s);
              """ % (self.db['table'], pk, name, type, parent)
        try:
            self.cursor.execute(sql)
            self.conn.commit()
            print '+',
            N += 1
        except MySQLdb.Warning, w:
            print "
Warning:%s" % str(w)
            print '#',
        except MySQLdb.Error, e:
            if not 'Duplicate entry' in str(e):
                print "
Error:%s" % str(e)
                self.debug(pk, name)
                self.debug(pk, parent)
            else:
                print '=',
                self.debug(pk, name)
        except:
            traceback.print_exc()
            print '?',

    def do_executemany(self, items, params=None):
        sql = """INSERT INTO `%s` (`id`, `name`, `type`, `parent`) VALUES """ % self.db['table']
        sql +="(%s, %s, %s, %s)";
        try:
            self.cursor.executemany(sql, items)
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error:%s" % str(e)
        except:
            traceback.print_exc()


class Creeper(Handle):

    def __init__(self, *args, **kwargs):
        Handle.__init__(self, *args, **kwargs)
        self.init_db()
        self.root_url = ''
        self._type = {1:'province',2:'city',3:'county',4:'town',5:'village'}

    def init_db(self):
        __sql = """CREATE TABLE IF NOT EXISTS `%s` (
          `id` bigint(20) NOT NULL,
          `name` varchar(30) NOT NULL,
          `type` varchar(30) NOT NULL,
          `parent` bigint(20) DEFAULT NULL,

          PRIMARY KEY (`id`),
          KEY `Group_12345` (`name`),
          KEY `Group_67890` (`parent`),
          FOREIGN KEY(parent) REFERENCES `%s` (`id`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
        """ % (self.db['table'], self.db['table'])

        try:
            self.cursor.execute(__sql)
            self.cursor.execute("""CREATE TABLE IF NOT EXISTS `debug` (
                  `id` int(11) NOT NULL AUTO_INCREMENT,
                  `badid` bigint(20) NOT NULL,
                  `others` varchar(256) DEFAULT NULL,
                  PRIMARY KEY (`id`)
                ) ENGINE=InnoDB DEFAULT CHARSET=utf8;""")
            self.conn.commit()
        except MySQLdb.Warning, w:
            if not 'already exists' in str(w):
                print "Warning:%s" % str(w)
        except MySQLdb.Error, e:
            print "Error:%s" % str(e)
            traceback.print_exc()
        except:
            traceback.print_exc()
            pass

    def debug(self, *args):
        try:
            _sql = """INSERT INTO `debug` (`badid`, `others`) VALUES(%s, '%s');
                   """ % (args[0], args[1])
            self.cursor.execute(_sql)
            self.conn.commit()
        except:
            traceback.print_exc()
        finally:
            return


    def __get_url(self, tup, index):
        id = tup[0]
        parent = tup[-1]
        __url = []
        for i in range(0, index, 2):
            __url.append(str(id)[i: i+2])
        __url.append(str(id))
        return self.root_url + '/'.join(__url) + '.html'

    def format_tag(self, x_tag, url, index, limit=None):
        if limit and index > limit: return
        parent = str(url.split('/2013/')[1].split('/')[-1])[:-5]
        __type = self._type[index]
        __parent = not parent and 'NULL' or parent
        if isinstance(x_tag, bs4.element.Tag) and x_tag.has_attr('href'):
            print '.',
            href = x_tag['href']
            child_url = '/'.join(url.split('/')[:-1])
            full_url = '/'.join([child_url, href])
            __pk = self.format_pk(href.split('.html')[0].split('/')[-1], index)
            __name = x_tag.text
            i = (__pk, __name, __type, __parent)
            self.do_execute(*i)
            self.get_info(full_url, index + 1, limit)
        else:
            print '*',
            __pk = self.format_pk(x_tag[0].text, index)
            __name = x_tag[1].text
            self.do_execute(__pk, __name, __type, __parent)

    def get_info(self, url, index, limit=None):
        # 解析页面,获取目标区域的数据
        # 获取单元数据,格式化为可供插入数据库的元组
        try:
            __html = urllib2.urlopen(url).read()
            __soup = bs4.BeautifulSoup(__html, from_encoding='gbk')
            __tr = __soup('tr', class_='%str' % self._type[index])
        except:
            try:
                self.debug(0, url)
            except:
                traceback.print_exc()
            finally:
                return
        __lst = []
        for tr in __tr:
            # 每个tr中的多个td代表多个省
            if index == 1:
                for td in tr('td'):
                    __lst.extend(td('a'))
                continue

            # 每个tr中的多个td代表一个节点,取最后一个td中的a标签
            if tr('td')[-1]('a'):
                __lst.extend(tr('td')[-1]('a'))
            else:
            # 没有子节点的元素,单纯的通过td中的数据创建
                self.format_tag([tr('td')[0], tr('td')[-1]], url, index, limit)

        for a in __lst:
            self.format_tag(a, url, index, limit)


    def do_get_childs(self, index):
        # 1. 查询获取当前层级节点数目
        _sql = """SELECT COUNT( * ) FROM  `%s` WHERE `type`='%s';
               """ % (self.db['table'], self._type[index])
        self.cursor.execute(_sql)
        total = self.cursor.fetchone()[0]

        # 2. 遍历父节点,获取子节点数据
        for i in range(0, total, 30):
            _sql = """SELECT `id`, `parent` FROM `%s` WHERE `type` = '%s' LIMIT %s, %s;
                   """ % (self.db['table'], self._type[index], i, i+30)
            self.cursor.execute(_sql)
            id_and_parent = list(set(self.cursor.fetchall()))
            for tup in id_and_parent:
                url = self.__get_url(tup, index)
                self.get_info(url, index+1)


warnings.filterwarnings('error', category = MySQLdb.Warning)
del warnings

if __name__ == '__main__':
    url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/"
    db_config = GLOBAL_CONFIG['db']
    g = Creeper(**db_config)
    g.root_url = url
    g.init_db()
    lst = g.get_info(url, index=1, limit=5)
    # g.do_get_childs(3)