Scraping lagou.com job data with Scrapy

It is actually quite simple, but a few small gotchas cost me a fair amount of time, so here is a brief write-up for future reference.

>> scrapy startproject lagou
>> cd lagou
>> scrapy genspider lagoujd www.lagou.com
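
For reference, the two commands above generate a project skeleton roughly like this (trimmed to the files touched below; the spider file name comes from the genspider argument):

lagou/
├── scrapy.cfg
└── lagou/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── lagoujd.py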

Define the item

Flesh out the definition in items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LagouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    job_title = scrapy.Field()
    job_description = scrapy.Field()
    job_url = scrapy.Field()
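
A scrapy.Item behaves like a dict restricted to the declared fields, which is what lets the pipeline call dict(item) later. A minimal sketch of that behaviour:

from lagou.items import LagouItem

item = LagouItem()
item['job_title'] = u'产品经理'
item['job_url'] = 'http://www.lagou.com/jobs/787409.html'
print dict(item)          # plain dict with the two fields set
# item['salary'] = '20k'  # raises KeyError: 'salary' is not a declared field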

Flesh out the spider

# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule  # scrapy.spiders in newer Scrapy
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor  # scrapy.linkextractors.LinkExtractor in newer Scrapy
from lagou.items import LagouItem
import codecs, re
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack: make implicit str/unicode conversion use utf-8

class LagoujdSpider(CrawlSpider):
    name = "lagoujd"
    allowed_domains = ["lagou.com"]
    start_urls = (
        'http://www.lagou.com/jobs/787409.html',
    )

    rules = [
        # the escaping matters: \d+ matches the numeric job id in the URL
        Rule(SgmlLinkExtractor(allow=r'jobs/\d+\.html'), callback='parse_lagou', follow=True),
    ]

    def parse_lagou(self, response):  # important: a CrawlSpider must not override the default parse()!
        # patterns that locate and split the "requirements" part of a Chinese job description
        self.SPLIT_DEMAND = re.compile(u'(要求|资格|条件)[::;\n]?')
        self.SPLIT_LINE = re.compile(u'[;;。\n]')
        self.DEMAND = re.compile(u'具备|熟悉|具有|熟练|掌握|良好的|能够|丰富的|以上学历|优秀的|有深入研究|有很强的|工作经历|工作经验|善于|懂得|优先|不少于|不超过|喜欢|较强的.{2,8}能力|相关专业|相关学历|开发经验|实习经验|\d年以上')

        item = LagouItem()
        sel = Selector(response)
        try:
            item["job_title"] = sel.xpath("//title/text()").extract()[0].split('-')[0][:-2].strip()
            job_des = sel.xpath('//*[@id="container"]/div[1]/div[1]/dl[1]/dd[2]').extract()[0]
            job_des = BeautifulSoup(job_des, 'html.parser').get_text()  # strip the HTML tags
            item["job_description"] = self.get_demand(job_des)
            item["job_url"] = response.url
            print item['job_title']
        except Exception, e:
            print e
       # if item.has_key("job_title") and item.has_key("job_description"):
       #     with codecs.open("./output/"+item["job_title"].strip()+".txt",'a','utf-8') as fw:
       #         fw.write(item["job_description"])
       #         print item["job_title"],"done"
        
        
        return item

    def get_demand(self, jdstr):
        """Keep only the lines after the requirements marker that still look like requirements."""
        res = []
        if self.SPLIT_DEMAND.search(jdstr):
            pos = self.SPLIT_DEMAND.search(jdstr).span()[1]
            linelist = self.SPLIT_LINE.split(jdstr[pos:])
            for line in linelist:
                if len(line) < 5: continue
                if re.match(r'\d', line.strip()):  # numbered requirement, e.g. "1、..."
                    res.append(line)
                elif self.DEMAND.search(line):     # line contains a requirement keyword
                    res.append(line)
                else:
                    break  # stop at the first line that no longer looks like a requirement
        return '\n'.join(res)
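
To sanity-check the two splitting regexes without running a crawl, they can be pasted into a tiny standalone script; the sample JD string below is made up for illustration:

# -*- coding: utf-8 -*-
import re

SPLIT_DEMAND = re.compile(u'(要求|资格|条件)[::;\n]?')
SPLIT_LINE = re.compile(u'[;;。\n]')

jd = u'岗位要求:1、2年以上Python开发经验;2、熟悉Scrapy框架。'
pos = SPLIT_DEMAND.search(jd).span()[1]  # position right after "要求:"
for line in SPLIT_LINE.split(jd[pos:]):
    if line.strip():
        print line
# prints the two numbered requirement lines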

Store the scraped items as JSON

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import codecs

class LagouPipeline(object):
    def __init__(self):
        self.file = codecs.open('lagou_jd.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # one JSON object per line (JSON Lines), with Chinese kept readable
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):  # note: the pipeline hook is close_spider, not spider_closed
        self.file.close()
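
Incidentally, Scrapy ships an exporter that does the same job; a minimal alternative pipeline, sketched against the old scrapy.contrib import path used above (scrapy.exporters in newer versions):

from scrapy.contrib.exporter import JsonLinesItemExporter

class LagouExportPipeline(object):
    def __init__(self):
        self.file = open('lagou_jd.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)  # writes one JSON object per line
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()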

Register the pipeline in settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for lagou project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'lagou'

SPIDER_MODULES = ['lagou.spiders']
NEWSPIDER_MODULE = 'lagou.spiders'

ITEM_PIPELINES = {
    'lagou.pipelines.LagouPipeline':300,
}
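
The number is a priority in the 0-1000 range; lower values run first, which only matters once several pipelines are chained, e.g. (the dedup pipeline here is hypothetical):

ITEM_PIPELINES = {
    'lagou.pipelines.DedupPipeline': 100,   # hypothetical: would run first
    'lagou.pipelines.LagouPipeline': 300,   # then the JSON writer above
}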

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lagou (+http://www.yourdomain.com)'

Run it and crawl away!

>> scrapy crawl lagoujd
or
>> scrapy crawl lagoujd -o item.json -t json
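
Since each line of lagou_jd.json is a standalone JSON object, checking the results from Python is straightforward:

# -*- coding: utf-8 -*-
import json
import codecs

for line in codecs.open('lagou_jd.json', encoding='utf-8'):
    jd = json.loads(line)
    print jd['job_title'], jd['job_url']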

demo:

jkmiao@jkmiao-ipin:~/workplace/spiders/lagou$ more lagou_jd.json
{"job_url": "http://www.lagou.com/jobs/1102051.html", "job_description": "1、具有2年以上互联网产品经验,优秀的交互设计能力,对产品设计有极高的要求,追求极致的用户体验 2、善于观察与学习,具有宽广的视野、对于整体产品规划有自己的见解和理念 3、有优秀缜密的逻辑与思维能力,良好的协调能力、分析、计划及项目管理能力,具备良好的团队精神,沟通能力强 4、熟练使用 Axure 、 visio 、 office 等软件 5、有成熟的O2O平台类产品设计经验者优先", "job_title": "产品经理"}
{"job_url": "http://www.lagou.com/jobs/917776.html", "job_description": "1、有2年以上互联网产品规划和体验设计相关经验,熟悉互联网或软件产品整体实现过程,包括从需求分析到产品发布 2、有完整策划至少2个以上成功、目前尚在运营中的互联网产品设计案例 3、能通过数据分析等系统性方法深刻理解用户需求并予以满足 4、执行力强,善于组织协调并推动项目进展 5、对工作充满热情,富有创新精神,能承受较大的工作压力 6、有良好的学习能力、良好的沟通能力和团队合作精神,出色的组织能力", "job_title": "产品经理"}

Create a new script, preprocess.py, for further cleanup. (Note that it reads job_demand and sum_request fields, i.e. it assumes the item was later extended beyond the three fields defined above.)

#!/usr/bin/env python
# coding=utf-8

import simplejson as json
import re
import sys,codecs
from collections import defaultdict
reload(sys)
sys.setdefaultencoding('utf-8')
from simhash import Simhash



def get_top_jobname(jobname, namelist):
    """Return (simhash distance, name) of the closest already-seen job title."""
    namelist = sorted(namelist)
    dis = [(Simhash(jobname).distance(Simhash(other)), other) for other in namelist]
    dis = sorted(dis, key=lambda x: x[0])
    return dis[0]
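
The simhash package scores similarity as the Hamming distance between 64-bit fingerprints, so near-duplicate titles get small distances and unrelated ones large distances; a quick illustration (exact numbers depend on the library's tokenizer):

# -*- coding: utf-8 -*-
from simhash import Simhash

print Simhash(u'产品经理').distance(Simhash(u'高级产品经理'))    # small distance: near-duplicate titles
print Simhash(u'产品经理').distance(Simhash(u'Java开发工程师'))  # much larger distance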


def clean_text(fname='./lagou_jd.json'):
    SPLIT_LINE = re.compile(u'[;;。\n]')
    FILTER_DEMAND = re.compile(u'薪酬|待遇|福利|加入我们|职责|你|成为')  # skip JDs that drift into perks/duties
    res = defaultdict(str)
   # fw1 = codecs.open('demands.txt','w','utf-8')
   # fw2 = codecs.open('duty.txt','w','utf-8')
    i = 1
    for line in codecs.open(fname):
        jd = json.loads(line)
        if not re.match(u'\d', jd['job_demand'].strip()) or len(jd["job_demand"]) < 8 or len(jd["job_title"]) < 2: continue
        if FILTER_DEMAND.search(jd['job_demand']): continue
        
        if len(res.keys()) > 0:
            top_job = get_top_jobname(jd['job_title'], res.keys())
        else:
            top_job = (0, jd['job_title'])

        if top_job[0] < 7:  # distance < 7: treat it as the same job title
            if top_job[0] > 4:
                print top_job[0], top_job[1], jd['job_title']  # log borderline merges
            jd['job_title'] = top_job[1]

        jd["job_demand"] = re.sub(ur"xa0","",jd["job_demand"].decode('utf-8'))
      # jd["job_duty"] = re.sub(ur"xa0","",jd["job_duty"].decode('utf-8'))
        jd["sum_request"] = re.sub(ur"xa0|s+"," ",jd["sum_request"].decode('utf-8'))

        demand = [ x.strip() for x in jd['job_demand'].split() if len(x)>5]
        if len(demand)<3: continue
       # duty = [x.strip() for x in jd['job_duty'].split() if len(x)>5]
        sum_request = [ x.strip() for x in jd['sum_request'].split() if len(x)>3 ]
        

        jd['job_demand'] = '\n'.join(demand)
    #   jd['job_duty'] = '\n'.join(duty)

    #    fw1.write('\n'.join(demand)+'\n')
    #    fw2.write('\n'.join(duty)+'\n')
       

        if not res.has_key(jd["job_title"]):
            res[jd["job_title"]] = ' '.join(sum_request)+'
'+jd["job_demand"].strip()
        else:
            res[jd['job_title']] += '
'+'
'.join(SPLIT_LINE.split(jd['job_demand']))
        i += 1
        if i%100==0:
            print i
    print i,"done"
    print len(res)
    json.dump(res,codecs.open('./lagou_jd_clean.json','w','utf-8'))


def get_jds(fname='./lagou_jd_clean.json'):
    res = json.load(codecs.open(fname))
    i = 1
    for k,v in res.iteritems():
        if len(v.split())<16:
            print i,k
            print v
            print "
============
"
            i += 1
            if i>20:
                break


if __name__ == "__main__":
    clean_text()
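
Run the cleanup after the crawl finishes; it writes lagou_jd_clean.json next to the raw dump:

>> python preprocess.py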
A small step every day, a big step in life! Good luck~
Original post: https://www.cnblogs.com/jkmiao/p/4831843.html