Two notes from learning pyspider (Python web crawler)

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-11-08 09:56:40
# Project: product

from pyspider.libs.base_handler import *
import re
import base64
import os
import urllib
import urllib.request
import requests
import json


class Handler(BaseHandler):

    # JSON "default" hook: lets json.dumps() serialize bytes by decoding
    # them as UTF-8 (meant to be passed as json.dumps(obj, default=self.default)).
    def default(self, obj):
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        return json.JSONEncoder.default(self, obj)

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://www.yunjinet.com/sell/list/7934/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            # /sell/show... links are product detail pages; everything else
            # is treated as another listing page
            if re.match(r'http://www\.yunjinet\.com/sell/show.+', each.attr.href):
                self.crawl(each.attr.href, callback=self.detail_page)
            else:
                self.crawl(each.attr.href, callback=self.index_page)


    @config(priority=2)
    def detail_page(self, response):
        image_url_list = []  # image URLs
        tags = []            # category tags
        images = []          # base64-encoded images
        x = 0
        imageresult = []     # image objects for the payload
        results = []         # final results: every JSON record goes here
        result = dict()      # one JSON record
        headers = {"Content-Type": "application/json"}
        path = r'D:\pythonlianxi\testimg'  # raw string, so the backslashes are not escape sequences

        if not os.path.isdir(path):
            os.makedirs(path)
        paths = path + '\\'  # a trailing backslash must be escaped

        for img in response.doc('div[class="vertical-img"] img').items():
            image_url_list.append(img.attr.src)
            urllib.request.urlretrieve(img.attr.src, '{0}{1}.jpg'.format(paths, x))
            with open(paths + str(x) + ".jpg", "rb") as f:
                base64_data = base64.b64encode(f.read()).decode()
            images.append(base64_data)
            imgurl = dict()  # holds one base64 image
            imgurl['imgBase64'] = base64_data
            imageresult.append(imgurl)
            x = x + 1

        for each in response.doc('div[class="location_an mt_10"]').items('a'):
            tags.append(each.text())

        # pull the first number (with an optional decimal point) out of the price text
        pricebefore = response.doc('p[class="s"]').text()
        findlist = re.findall(r'[0-9]*\.?[0-9]+', pricebefore)
        if not len(findlist):
            findlist = [0]
        print(findlist[0])

        result['originalLink'] = response.url
        result['productName'] = response.doc('h1').text()
        result['price'] = findlist[0]
        result['productDescription'] = response.doc('div[class="product_content"]').text()
        result['category1'] = tags[2]
        result['category2'] = tags[3]
        result['category3'] = tags[4]
        result['images'] = imageresult

        results.append(result)
        print(result)

        payload = json.dumps(result)
        r = requests.post('http://192.168.1.115/es/index/product', data=payload, headers=headers)

        return {
            "originalLink": response.url,
            "productName": response.doc('h1').text(),
            "price": response.doc('p[class="s"]').text(),
            "productDescription": response.doc('div[class="product_content"]').text(),
            "category1": tags[2],
            "category2": tags[3],
            "category3": tags[4],
            "images": images,
        }
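
The `default` helper on the Handler exists because `base64.b64encode()` returns bytes, which `json.dumps()` cannot serialize on its own. The script sidesteps this by calling `.decode()` on each image, but the hook can also be wired in directly. A minimal standalone sketch of that round trip (the endpoint URL is the one from the script; `sample.jpg` is a hypothetical local file):

import base64
import json
import requests

def bytes_default(obj):
    # mirrors Handler.default: decode bytes as UTF-8 for JSON
    if isinstance(obj, bytes):
        return obj.decode('utf-8')
    raise TypeError('not JSON serializable: %r' % type(obj))

with open('sample.jpg', 'rb') as f:  # hypothetical local image
    payload = json.dumps(
        {'images': [{'imgBase64': base64.b64encode(f.read())}]},  # raw bytes
        default=bytes_default,  # converts the bytes value during dumping
    )

requests.post('http://192.168.1.115/es/index/product',
              data=payload, headers={'Content-Type': 'application/json'})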

-------------------------------

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-11-08 09:56:40
# Project: product

from pyspider.libs.base_handler import *
import re
import base64
import os
import urllib
import urllib.request
import requests
import json


class Handler(BaseHandler):

    # same bytes-to-UTF-8 JSON hook as in the first script
    def default(self, obj):
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        return json.JSONEncoder.default(self, obj)

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://product.suning.com/0000000000/10629204175.html#?safp=d488778a_10004_0_daa73474ac', callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            # anything under product.suning.com is a product detail page
            if re.match(r'https://product\.suning\.com/.+', each.attr.href):
                self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
            else:
                self.crawl(each.attr.href, callback=self.index_page, validate_cert=False)


    @config(priority=2)
    def detail_page(self, response):
        image_url_list = []  # image URLs
        tags = []            # category tags
        images = []          # base64-encoded images
        x = 0
        imageresult = []     # image objects for the payload
        results = []         # final results: every JSON record goes here
        result = dict()      # one JSON record
        #headers = {"Content-Type": "application/json"}
        path = r'D:\pythonlianxi\testimg'  # raw string, so the backslashes are not escape sequences

        if not os.path.isdir(path):
            os.makedirs(path)
        paths = path + '\\'  # a trailing backslash must be escaped

        # Suning lazy-loads images: the real URL sits in src2, sometimes
        # without a scheme
        for img in response.doc('div[moduleId="R1901001_3"]').items('img'):
            if re.match('http.+', img.attr.src2):
                imgurl = img.attr.src2
            else:
                imgurl = 'https://' + img.attr.src2
            # image_url_list.append(img.attr.src)
            urllib.request.urlretrieve(imgurl, '{0}{1}.jpg'.format(paths, x))
            with open(paths + str(x) + ".jpg", "rb") as f:
                base64_data = base64.b64encode(f.read()).decode()
            #images.append(base64_data)
            imgitem = dict()  # holds one base64 image
            imgitem['imgBase64'] = base64_data
            imageresult.append(imgitem)
            x = x + 1

        for each in response.doc('a[class="ft"]').items():
            tags.append(each.text())

        # the price extraction from the first script does not work here,
        # so the price is hard-coded below
        #pricebefore = response.doc('p[class="s"]').text()
        #findlist = re.findall(r'[0-9]*\.?[0-9]+', pricebefore)
        #if not len(findlist):
        #    findlist = [0]

        result['originalLink'] = response.url
        result['productName'] = response.doc('h1').text()
        result['price'] = 3000
        result['productDescription'] = response.doc('meta[name="description"]').attr.content
        result['category1'] = tags[0]
        result['category2'] = tags[1]
        result['category3'] = tags[2]
        result['images'] = imageresult

        #results.append(result)
        #print(result)

        #payload = json.dumps(result)
        #r = requests.post('http://192.168.1.115/es/index/product', data=payload, headers=headers)

        return {
            "originalLink": response.url,
            "productName": response.doc('h1').text(),
            #"price": response.doc('p[class="s"]').text(),
            "productDescription": response.doc('meta[name="description"]').attr.content,
            "category1": tags[0],
            "category2": tags[1],
            "category3": tags[2],
            "images": imageresult,
        }
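
Both scripts repeat the same download-then-encode steps for every image. A small helper could factor that out; this is only a sketch, and `fetch_image_base64` with its arguments is a hypothetical name, not part of the original code:

import base64
import os
import urllib.request

def fetch_image_base64(url, save_dir, index):
    # prepend a scheme when the lazy-load attribute (e.g. Suning's src2)
    # comes without one
    if not url.startswith('http'):
        url = 'https://' + url
    # download to <save_dir>/<index>.jpg, then read it back as base64 text
    local_path = os.path.join(save_dir, '{0}.jpg'.format(index))
    urllib.request.urlretrieve(url, local_path)
    with open(local_path, 'rb') as f:
        return base64.b64encode(f.read()).decode()

# usage inside detail_page (sketch):
#   imageresult.append({'imgBase64': fetch_image_base64(img.attr.src2, path, x)})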

Original post (in Chinese): https://www.cnblogs.com/lely/p/9936455.html