# wiki页面文本挖掘 (wiki page text mining)

import os,sys
import sys
from bs4 import BeautifulSoup
import urllib.request
# reload(sys)
# sys.setdefaultencoding('utf-8')
# Project root: the parent of the directory containing this file.
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Make project-local packages importable from the root.
sys.path.append(BASE_PATH)
# Directory where the scraped wiki dump files are written (BASE_PATH/scripts/).
DATA_PATH = BASE_PATH + os.path.sep + 'scripts' + os.path.sep
import time,configparser,re
#from http.u_http import HttpClient
#from data.read_wiki import read_wiki
#from data.read_wiki import wiki
import json
#from User import User
#httpClient = HttpClient()
import requests

def login(url, body, url1):
    """Log in to the wiki, scrape a release page, and extract project versions.

    Parameters
    ----------
    url : str
        Login endpoint (POST target).
    body : dict
        Login form fields (username, password, ...).
    url1 : str
        URL of the wiki page to scrape once authenticated.

    Side effects: writes the extracted table-cell HTML to a timestamp-named
    file under DATA_PATH and prints each intermediate list.
    """
    # Authenticate without following the redirect so the fresh session
    # cookie is visible on this response.
    resp = requests.post(url=url, data=body, allow_redirects=False)
    cks = {'JSESSIONID': resp.cookies['JSESSIONID']}

    page = requests.get(url1, cookies=cks)
    page.encoding = 'utf-8'

    # Drill down tbody -> tr -> td, re-parsing the comma-joined fragments at
    # each level (mirrors the original multi-pass extraction).
    soup = BeautifulSoup(page.text, 'html.parser')
    tbodies = ','.join(str(v) for v in soup.findAll('tbody'))
    rows = ','.join(str(v) for v in BeautifulSoup(tbodies, 'html.parser').findAll('tr'))
    cells = ','.join(str(v) for v in BeautifulSoup(rows, 'html.parser').findAll('td'))
    data = str(BeautifulSoup(cells, 'html.parser'))

    # Persist the scrape; the timestamp doubles as the file name.
    subject = time.strftime("%Y%m%d%H%M%S", time.localtime())
    with open(DATA_PATH + subject, "w", encoding='utf-8') as op:
        op.write(data)

    # Full candidate project list: every option of every section of the
    # config file. (Original re-read the dump file on each loop iteration;
    # `data` is already in memory, so reuse it.)
    deploy_order = configparser.ConfigParser()
    deploy_order.read('online_list.conf')  # 所有项目清单
    deploy = [opt
              for section in deploy_order.sections()
              for opt in deploy_order.options(section)]
    print(deploy)

    # Keep only projects that actually appear on the wiki page.
    plan = []
    for order in deploy:
        if order in data:
            plan.append(order)
        else:
            print("%s没找到" % order)

    # 'comx-bs' is never released -- drop it. (BUG FIX: the original called
    # plan.remove() while iterating plan, which skips the following element;
    # filtering into a new list is safe.)
    plan = [name for name in plan if name != 'comx-bs']
    print(plan)

    # For each released project, capture the text between "<name>/" and the
    # closing </td> -- presumably the version cell; verify against the page.
    plan_order = []
    for plan_name in plan:
        plan_order.append(re.findall(plan_name + '/(.*?)</td>', data, re.S))
    print(plan_order)

    # When several matches exist, the last entry is taken as the newest.
    plan_order2 = [line[-1] if line else '' for line in plan_order]
    print(plan_order2)

    # BUG FIX: the original pattern r'd.*d.*d' matched the *letter* "d";
    # \d matches digits, which is what the "数字.数字" comment intended.
    plan_order3 = [re.findall(r'\d.*\d.*\d', x) for x in plan_order2]
    print(plan_order3)

    # Flatten each single-match list into a plain version string.
    plan_order4 = [''.join(i) for i in plan_order3]
    print(plan_order4)

def run(pageId):
    """Build the view URL for *pageId* and scrape it via login().

    NOTE(review): credentials are hard-coded below -- they should be moved
    to a config file or environment variables.
    """
    page_url = ("http://wiki.intra.gomeplus.com/pages/viewpage.action?"
                + 'pageId=' + str(pageId))
    print(page_url)
    login_url = 'http://wiki.intra.gomeplus.com/dologin.action'
    form = {
        'os_username': 'wangsen',
        'os_password': 'WANGs1.',
        'login': '登录',
        'os_destination': ''
    }
    login(login_url, form, page_url)
# print(value)
# #body如果需要就填上数据如果不需要就置空,body={}
# body = {}
#
# #接口访问的方式 get或post
# u_method = "get"
#
# #处理v2接口需要在header里加“Accept”
# header = {}
# # A451ED019F130356AEF51CB768540B86
# #value = "A3B6F39B9F9925E6BD5D61280A787892"
# header["Content-Type"] = "application/x-www-form-urlencoded"
# header["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"
# cookie='JSESSIONID='+value+'; doc-sidebar=300px;confluence.list.pages.cookie=list-content-tree;
# confluence.browse.space.cookie=space-pages;Hm_lvt_4d914dda44888419a4588c6a4be8edcc=1473650378'
# print(cookie)
# header['Cookie'] =cookie
# # if ApiIsV2(url):
# # header["Content-Type"] = "application/x-www-form-urlencoded"
# # header["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"
#
# #verify为验证项列表,用于检查返回内容中的关键字
# verify = []
#
# postData={}
# #执行被测接口
# result_dict = httpClient.api_verify(url,postData,u_method,header,verify,body)
# print(result_dict)
#接受发版列表
#faban_list=read_wiki(result_dict)





#判断是不是v2接口
# def ApiIsV2(url):
# if "v2" in url:
# return True
# else:
# return False

#前提操作
# def preStep(self):
# "前提操作"
# url = ""
# postData = {}
# u_method = "get"
# header = {}
# verify = []
#
# response = httpClient.api_request(url, postData, u_method, header, verify)
# return response

# #将公参和必填参数组合
# def sign_str(self,data,isV2=False):
# publicParaV1 = {
# "ip":"0.0.0.0",
# "appType":"1",
# "clientOsVersion":"8.4",
# "sortType":"0",
# "pubPlat":"0120102002000000",
# "appVersion":"v1.0.2.33",
# "latitude":"39.964707",
# "otherDevInfo":"otherDevInfo",
# "netType":"3G",
# "numPerPage":"5",
# "devId":"0",
# "clientOs":"1",
# "mac":"00000000",
# "lastRecordId":"0",
# "longitude":"116.47308",
# "pageNum":"1",
# "order":"2",
# "phoneType":"iPhone"
# }
#
# publicParaV2 = {
# "integrity":"full",
# "device":"iOS/9.2.1/iPhone/IPhone12345678",
# "app":"001/1111111111111",
# "appVersion":"1.0.1",
# "net":"",
# "accessToken":"",
# "traceId":"",
# "jsonp":""
# }
# if isV2:
# dictMerged = dict(data, **publicParaV2)
# else:
# dictMerged = dict(data, **publicParaV1)
#
# return dictMerged


if __name__ == "__main__":

pageId=input('wiki_number:')
run(pageId)
# 原文地址 (original source): https://www.cnblogs.com/wangsen-123/p/6030812.html