Python：将 www.thisamericanlife.org 的页面转换为 PDF

环境安装

pip install  requests
pip install  beautifulsoup4
pip install  pdfkit


$ sudo apt-get install wkhtmltopdf  # ubuntu
$ sudo yum install wkhtmltopdf      # centos

脚本

#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-
# @Time    : 2019/11/18 下午10:48
# @Author  : yon
# @Email   : xxx@qq.com
# @File    : day1.py.py

import os
import re
import time
import logging
import pdfkit
from bs4 import BeautifulSoup
import requests


# Headers sent with the page download: a desktop-browser User-Agent and a
# Referer.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
    'Referer': 'https://www.google.com/'
}

# wkhtmltopdf options; these are handed to pdfkit.from_string() below.
options = {
    'page-size': 'Letter',
    # Only the <article> fragment is rendered (without the page's <head>),
    # so the input encoding is declared explicitly here.
    'encoding': "UTF-8",
    'custom-header': [
        ('Accept-Encoding', 'gzip')
    ]
}

# Download the transcript page. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() stops it from
# turning an HTTP error page into a PDF.
resp = requests.get(
    'https://www.thisamericanlife.org/687/transcript',
    headers=headers,
    timeout=30,
)
resp.raise_for_status()

# Parse the raw bytes so BeautifulSoup can detect the page encoding itself.
soup = BeautifulSoup(resp.content, "html.parser")
body = soup.find("article")
if body is None:
    # Without this guard str(None) == "None" would be rendered to the PDF.
    raise SystemExit("no <article> element found in the downloaded page")

# Serialize the <article> element (tag included) and convert it to PDF.
all1 = str(body)
pdfkit.from_string(all1, "/home/yon/Desktop/tt.pdf", options=options)

另外一种写法

import os
import re
import time
import logging
import requests
import urllib.request
import os
import stat
import pdfkit
from bs4 import BeautifulSoup

# headers = {
#     # 'Accept': 'application/json, text/javascript, */*; q=0.01',
#     'Accept': '*/*',
#     'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7',
#     'Cache-Control': 'no-cache',
#     'accept-encoding': 'gzip, deflate, br',
#     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
#     'Referer': 'https://www.google.com/'
# }
#
#
# resp = requests.get('https://www.thisamericanlife.org/687/transcript', headers=headers)
#
# html = resp.content
# with open("thisaericanlife.html", 'wb') as f:
#     f.write(html)

# Parse the locally stored HTML file. It is opened in binary mode so that
# BeautifulSoup detects the document encoding itself instead of relying on
# the platform's default text encoding, and the `with` block closes the
# file handle as soon as parsing is done.
with open("thisaericanlife.html", "rb") as page_file:
    soup = BeautifulSoup(page_file, "html.parser")

article = soup.article
if article is None:
    # Fail with a clear message instead of an AttributeError on `.contents`.
    raise SystemExit("no <article> element found in thisaericanlife.html")

print(article.contents)
print("类型")

# Concatenate the serialized children of <article> (the tag itself is not
# included) into a single HTML string.
html = "".join(str(node) for node in article.contents)

print(html)

# The fragment has no <head>/<meta charset>, so tell wkhtmltopdf the input
# encoding explicitly.
pdfkit.from_string(html, "/home/baixiaoxu/desk/tt.pdf", options={"encoding": "UTF-8"})