# wiki页面文本挖掘 (wiki page text mining)

import os,sys
import sys
from bs4 import BeautifulSoup
import urllib.request
# reload(sys)
# sys.setdefaultencoding('utf-8')
# Project root: the parent of the directory containing this file.
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Make project-local packages importable from the root.
sys.path.append(BASE_PATH)
# Directory where the scraped wiki dump files are written (BASE_PATH/scripts/).
DATA_PATH = BASE_PATH + os.path.sep + 'scripts' + os.path.sep
import time,configparser,re
#from http.u_http import HttpClient
#from data.read_wiki import read_wiki
#from data.read_wiki import wiki
import json
#from User import User
#httpClient = HttpClient()
import requests

def login(url, body, url1):
    """Log in to the wiki, scrape a release page, and extract project versions.

    Parameters
    ----------
    url : str
        Login endpoint (POST target).
    body : dict
        Login form fields (username, password, ...).
    url1 : str
        URL of the wiki page to scrape once authenticated.

    Side effects: writes the extracted table-cell HTML to a timestamp-named
    file under DATA_PATH and prints each intermediate list.
    """
    # Authenticate without following the redirect so the fresh session
    # cookie is visible on this response.
    resp = requests.post(url=url, data=body, allow_redirects=False)
    cks = {'JSESSIONID': resp.cookies['JSESSIONID']}

    page = requests.get(url1, cookies=cks)
    page.encoding = 'utf-8'

    # Drill down tbody -> tr -> td, re-parsing the comma-joined fragments at
    # each level (mirrors the original multi-pass extraction).
    soup = BeautifulSoup(page.text, 'html.parser')
    tbodies = ','.join(str(v) for v in soup.findAll('tbody'))
    rows = ','.join(str(v) for v in BeautifulSoup(tbodies, 'html.parser').findAll('tr'))
    cells = ','.join(str(v) for v in BeautifulSoup(rows, 'html.parser').findAll('td'))
    data = str(BeautifulSoup(cells, 'html.parser'))

    # Persist the scrape; the timestamp doubles as the file name.
    subject = time.strftime("%Y%m%d%H%M%S", time.localtime())
    with open(DATA_PATH + subject, "w", encoding='utf-8') as op:
        op.write(data)

    # Full candidate project list: every option of every section of the
    # config file. (Original re-read the dump file on each loop iteration;
    # `data` is already in memory, so reuse it.)
    deploy_order = configparser.ConfigParser()
    deploy_order.read('online_list.conf')  # 所有项目清单
    deploy = [opt
              for section in deploy_order.sections()
              for opt in deploy_order.options(section)]
    print(deploy)

    # Keep only projects that actually appear on the wiki page.
    plan = []
    for order in deploy:
        if order in data:
            plan.append(order)
        else:
            print("%s没找到" % order)

    # 'comx-bs' is never released -- drop it. (BUG FIX: the original called
    # plan.remove() while iterating plan, which skips the following element;
    # filtering into a new list is safe.)
    plan = [name for name in plan if name != 'comx-bs']
    print(plan)

    # For each released project, capture the text between "<name>/" and the
    # closing </td> -- presumably the version cell; verify against the page.
    plan_order = []
    for plan_name in plan:
        plan_order.append(re.findall(plan_name + '/(.*?)</td>', data, re.S))
    print(plan_order)

    # When several matches exist, the last entry is taken as the newest.
    plan_order2 = [line[-1] if line else '' for line in plan_order]
    print(plan_order2)

    # BUG FIX: the original pattern r'd.*d.*d' matched the *letter* "d";
    # \d matches digits, which is what the "数字.数字" comment intended.
    plan_order3 = [re.findall(r'\d.*\d.*\d', x) for x in plan_order2]
    print(plan_order3)

    # Flatten each single-match list into a plain version string.
    plan_order4 = [''.join(i) for i in plan_order3]
    print(plan_order4)

def run(pageId):
    """Build the view URL for *pageId* and scrape it via login().

    NOTE(review): credentials are hard-coded below -- they should be moved
    to a config file or environment variables.
    """
    page_url = ("http://wiki.intra.gomeplus.com/pages/viewpage.action?"
                + 'pageId=' + str(pageId))
    print(page_url)
    login_url = 'http://wiki.intra.gomeplus.com/dologin.action'
    form = {
        'os_username': 'wangsen',
        'os_password': 'WANGs1.',
        'login': '登录',
        'os_destination': ''
    }
    login(login_url, form, page_url)
# print(value)
# #body如果需要就填上数据如果不需要就置空,body={}
# body = {}
#
# #接口访问的方式 get或post
# u_method = "get"
#
# #处理v2接口需要在header里加“Accept”
# header = {}
# # A451ED019F130356AEF51CB768540B86
# #value = "A3B6F39B9F9925E6BD5D61280A787892"
# header["Content-Type"] = "application/x-www-form-urlencoded"
# header["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"
# cookie='JSESSIONID='+value+'; doc-sidebar=300px;confluence.list.pages.cookie=list-content-tree;
# confluence.browse.space.cookie=space-pages;Hm_lvt_4d914dda44888419a4588c6a4be8edcc=1473650378'
# print(cookie)
# header['Cookie'] =cookie
# # if ApiIsV2(url):
# # header["Content-Type"] = "application/x-www-form-urlencoded"
# # header["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"
#
# #verify为验证项列表,用于检查返回内容中的关键字
# verify = []
#
# postData={}
# #执行被测接口
# result_dict = httpClient.api_verify(url,postData,u_method,header,verify,body)
# print(result_dict)
#接受发版列表
#faban_list=read_wiki(result_dict)





#判断是不是v2接口
# def ApiIsV2(url):
# if "v2" in url:
# return True
# else:
# return False

#前提操作
# def preStep(self):
# "前提操作"
# url = ""
# postData = {}
# u_method = "get"
# header = {}
# verify = []
#
# response = httpClient.api_request(url, postData, u_method, header, verify)
# return response

# #将公参和必填参数组合
# def sign_str(self,data,isV2=False):
# publicParaV1 = {
# "ip":"0.0.0.0",
# "appType":"1",
# "clientOsVersion":"8.4",
# "sortType":"0",
# "pubPlat":"0120102002000000",
# "appVersion":"v1.0.2.33",
# "latitude":"39.964707",
# "otherDevInfo":"otherDevInfo",
# "netType":"3G",
# "numPerPage":"5",
# "devId":"0",
# "clientOs":"1",
# "mac":"00000000",
# "lastRecordId":"0",
# "longitude":"116.47308",
# "pageNum":"1",
# "order":"2",
# "phoneType":"iPhone"
# }
#
# publicParaV2 = {
# "integrity":"full",
# "device":"iOS/9.2.1/iPhone/IPhone12345678",
# "app":"001/1111111111111",
# "appVersion":"1.0.1",
# "net":"",
# "accessToken":"",
# "traceId":"",
# "jsonp":""
# }
# if isV2:
# dictMerged = dict(data, **publicParaV2)
# else:
# dictMerged = dict(data, **publicParaV1)
#
# return dictMerged


if __name__ == "__main__":

pageId=input('wiki_number:')
run(pageId)
# 原文地址 (original source): https://www.cnblogs.com/wangsen-123/p/6030812.html