爬取智联招聘信息

import scrapy
from jobspider.items import JobspiderItem
import logging

class JobSpider(scrapy.Spider):
    """Spider that scrapes Java job listings from a Zhaopin search-result page.

    The search-result page renders its listings as ``table.newlist`` elements
    inside ``div#newlist_list_content_table``. One ``JobspiderItem`` is yielded
    per listing table, carrying the job name, company name, salary and
    workplace.
    """

    name = "job_spider"

    # First page of the search results for the keyword "java" (``kw=java``, ``p=1``).
    start_urls = [
        "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&kw=java&isadv=0&sg=df4b40a6bfaf49c08ef0cb9e8e2181f2&p=1"
    ]

    def parse(self, response):
        """Extract one item per job listing from a search-result page.

        Args:
            response: The downloaded search-result page.

        Yields:
            JobspiderItem: An item with ``jobname``, ``companyname``,
            ``salary`` and ``workplace`` filled in. The last three are
            ``None`` when the corresponding cell has no direct text node.
        """
        jobs = response.xpath('//div[@id="newlist_list_content_table"]/table[@class="newlist"]')

        # The first matched table is skipped -- presumably it is the
        # column-header table rather than a listing (verify against the page).
        for job in jobs[1:]:
            item = JobspiderItem()

            # The title is split across nested nodes (the search keyword is
            # wrapped in <b>), so every descendant text node is joined. The
            # sibling icon link and the trailing space inside the title link
            # contribute whitespace-only text, hence the strip().
            name_parts = job.xpath('.//td[@class="zwmc"]/div/a//text()').extract()
            item['jobname'] = ''.join(name_parts).strip()

            item['companyname'] = job.xpath('.//td[@class="gsmc"]/a/text()').extract_first()
            item['salary'] = job.xpath('.//td[@class="zwyx"]/text()').extract_first()
            item['workplace'] = job.xpath('.//td[@class="gzdd"]/text()').extract_first()

            yield item

以上爬虫用于爬取智联招聘的职位搜索结果。

智联招聘搜索结果页的 HTML 结构:

<div class="newlist_list_content" id="newlist_list_content_table">
	<table class="newlist" width="853" cellspacing="0" cellpadding="0">
		<tr>
			<td class="zwmc" style="width: 250px;">
				<input name="vacancyid" data-monitor="CZ751712970J00017764214|3" value="CZ751712970J00017764214_719_1_03_409__1_" onclick="zlapply.uncheckAll('allvacancyid')" type="checkbox">
				<div style="width: 224px;*width: 218px; _width:200px; float: left">
					<a style="font-weight: bold" par="ssidkey=y&ss=409&ff=03&sg=df4b40a6bfaf49c08ef0cb9e8e2181f2&so=3" href="http://jobs.zhaopin.com/CZ751712970J00017764214.htm" target="_blank"><b>java</b>开发工程师 </a><a href="http://e.zhaopin.com/products/1/detail.do" target="_blank" title="点击“顶”字,了解更多"><img src="/assets/images/top.png" border="0" align="absmiddle"> <img src="/assets/images/jp.gif" border="0" align="absmiddle"></a>
				</div>
			</td>
			<td style="width: 60px;" class="fk_lv"><span>64%</span></td>
			<td class="gsmc"><a href="http://company.zhaopin.com/CZ751712970.htm" target="_blank">北京中科网联信息技术研究院(有限合伙)</a> <a href="http://company.zhaopin.com/CZ751712970.htm" target="_blank" style="vertical-align: top;"><img src="//img03.zhaopin.cn/IHRNB/img/souvip1002.png" alt="1002" class="icon_vip" border="0" align="absmiddle"></a></td>
			<td class="zwyx">4001-6000</td>
			<td class="gzdd">郑州</td>
			<td class="gxsj"><span>置顶</span><a class="newlist_list_xlbtn" href="javascript:;"></a></td>
		</tr>
		<tr style="display: none" class="newlist_tr_detail">
			<td style="line-height: 0;" colspan="6" width="833px">
				<div class="newlist_detail">
					<div class="clearfix">
						<ul>
							<li class="newlist_deatil_two"><span>地点:郑州</span><span>公司性质:民营</span><span>经验:1-3年</span><span>学历:不限</span><span>职位月薪:4001-6000元/月</span></li><li class="newlist_deatil_last">...<b>Java</b>开发经验,熟悉J2EE体系结构,并能熟悉掌握SSH等开源框架;  3. 能熟练掌握和开发Web Service、SOAP、Socket、NIO等开发技术,对http、tcp、udp协议有一定的了解;  4. 精通Ajax、<b>Java</b>Script、HTML5等前...</li>
							
						</ul>
						<dl>
							<dt>
								<a href="javascript:zlapply.searchjob.ajaxApplyBrig1('CZ751712970J00017764214_719','ssi','_1_03_409__2_');searchMonitor.logSingleApplyData('CZ751712970J00017764214|3');">
									<img src="/assets/images/newlist_sqimg_03.jpg">
								</a>
							</dt>
							<dd><a href="javascript:zlapply.searchjob.saveOne('CZ751712970J00017764214_719');"><img src="/assets/images/newlist_scimg_06.jpg"></a></dd>
						</dl>
					</div>
				</div>
		</td></tr>
	</table>
</div>
原文地址:https://www.cnblogs.com/hfultrastrong/p/8023589.html