# Python web scraping notes: fetching pages with requests and parsing them with BeautifulSoup

import requests
# Fetch the Baidu home page. requests applies no timeout by default, so an
# explicit one keeps the script from hanging forever on a stalled connection.
res = requests.get('http://www.baidu.com', timeout=10)
# Force UTF-8 decoding of the body before reading res.text.
res.encoding = 'utf-8'
print(res.text)

  

from bs4 import BeautifulSoup

# Minimal document used to demonstrate the BeautifulSoup API below.
html = """
<html><head>head title</head><p>history</p></html>"""

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing one).
soup = BeautifulSoup(html, 'html.parser')

print(soup.prettify())            # re-indented markup of the whole document
print(soup.select('p'))           # CSS selector: list of every <p> tag
print(soup.select('p')[0])        # first <p> tag
print(soup.select('p')[0].text)   # text content of the first <p> tag
print(soup.p)                     # shortcut for the first <p> tag
print(soup.p.attrs)               # attribute dict of that tag
print(soup.find_all('p'))         # every <p> tag, via find_all
print(soup.find_all(id='dwww'))   # tags whose id is "dwww" (none in this document)

# ---------------- Using BeautifulSoup together with re ----------------
import re

# Collect every <td class="job"> cell from the parsed document.
names = soup.find_all('td', class_="job")

# Match any 2 to 5 characters that make up the text of an <a> link.
# re.findall only accepts a string, so the tags are serialised back to markup first.
matches = re.findall(r">(.{2,5})</a>", "".join(str(cell) for cell in names))
print(matches)
# ----------------------------------------------------------------------
 

# Print the text of every <p> tag in the parsed document.
links = soup.select('p')
for link in links:
    print(link.text)

# Original source: https://www.cnblogs.com/agang-php/p/9685584.html