python论文爬取（四）

app.py

from flask import Flask, jsonify, render_template, request, json

import mysqlUtil

# Flask application instance; the @app.route decorators below register
# their view functions on it.
app = Flask(__name__)


@app.route("/c1")
def hellv():
    """Render the ``view.html`` template for the ``/c1`` route."""
    page = render_template('view.html')
    return page

@app.route("/text")
def hellt():
    """Render the ``text.html`` template for the ``/text`` route."""
    page = render_template('text.html')
    return page

@app.route("/findlunwen")
def hello():
    """Serve the search page for the ``/findlunwen`` route.

    A Flask view must return a response object, string, dict, or similar;
    returning ``None`` makes Flask raise ``TypeError`` and answer with HTTP 500.

    NOTE(review): rendering ``find.html`` (the same template the ``/`` route
    serves) is an assumption about the intended page -- confirm.
    """
    return render_template('find.html')


@app.route("/")
def hellp():
    """Render the ``find.html`` template for the site root."""
    page = render_template('find.html')
    return page


@app.route("/c2", methods=['POST', 'GET'])
def wordcloud():
    """Return the rows from ``mysqlUtil.select_key()`` as parallel JSON arrays.

    The response has two keys: ``keyword`` (column 0 of every row) and
    ``value`` (column 1 of every row), in row order.
    """
    rows = mysqlUtil.select_key()
    print(rows)
    # Read both columns in a single pass over the rows, then split them.
    pairs = [(row[0], row[1]) for row in rows]
    payload = {
        "keyword": [keyword for keyword, _ in pairs],
        "value": [value for _, value in pairs],
    }
    return jsonify(payload)


@app.route("/c3", methods=['POST', 'GET'])
def select_lunwen():
    """Search stored papers and return the matches as parallel JSON arrays.

    Request parameters (query string or form body):
        tiaojian: search field label -- one of '题目', '摘要', '作者', '关键词'.
        firinput: the search text, passed through to ``mysqlUtil`` unchanged.
        jingzhun: '精准' selects ``mysqlUtil.select_lunwenj``; any other value
            selects ``mysqlUtil.select_lunwenm``.

    Returns:
        JSON with keys ``title``, ``zuozhe``, ``time`` and ``lianjie``, each a
        list in result-row order. An unrecognised ``tiaojian`` yields HTTP 400
        with an ``error`` message instead of an unhandled exception.
    """
    # Search-field label sent by the page -> column name handed to mysqlUtil.
    columns = {
        '题目': 'title',
        '摘要': 'abstract',
        '作者': 'zuozhe',
        # NOTE(review): keyword search queries the 'abstract' column, same as
        # the abstract search -- confirm this is intended.
        '关键词': 'abstract',
    }

    # request.values merges the query string with form data, so the POST
    # method this route advertises works as well as GET.
    tiaojian = request.values.get("tiaojian")
    firinput = request.values.get("firinput")
    jingzhun = request.values.get("jingzhun")

    column = columns.get(tiaojian)
    if column is None:
        return jsonify({"error": "unknown search field: {0!r}".format(tiaojian)}), 400

    search = mysqlUtil.select_lunwenj if jingzhun == '精准' else mysqlUtil.select_lunwenm
    rows = search(column, firinput)

    titles, links, authors, times = [], [], [], []
    for row in rows:
        titles.append(row[0])   # title
        links.append(row[1])    # link
        authors.append(row[3])  # zuozhe
        times.append(row[4])    # time
    return jsonify({"title": titles, "zuozhe": authors, "time": times, "lianjie": links})




if __name__ == '__main__':
    # Development server only: debug=True turns on the reloader and the
    # interactive debugger, so it stays bound to the loopback address.
    # The port is passed as an int, which is the type Flask.run documents.
    app.run(debug=True, host='127.0.0.1', port=5000)

Keyword.py

# -*- coding: utf-8 -*-
import sys

sys.path.append('../')

import jieba
import jieba.analyse
import mysqlUtil
from optparse import OptionParser

# file_name = "test.txt"
#
# content = open(file_name, 'rb').read()
# content = "Few-shot learning is an important area of research.  Conceptually, humans are readily able to understand new concepts given just a few examples, while in more pragmatic terms, limited-example training situations are common practice. Recent effective approaches to few-shot learning employ a metric-learning framework to learn a feature similarity comparison between a query (test) example, and the few support (training) examples.  However, these approaches treat each support class independently from one another, never looking at the entire task as a whole.  Because of this, they are constrained to use a single set of features for all possible test-time tasks, which hinders the ability to distinguish the most relevant dimensions for the task at hand.  In this work, we introduce a Category Traversal Module that can be inserted as a plug-and-play module into most metric-learning based few-shot learners.  This component traverses across the entire support set at once, identifying task-relevant features based on both intra-class commonality and inter-class uniqueness in the feature space.  Incorporating our module improves performance considerably (5%-10% relative) over baseline systems on both miniImageNet and tieredImageNet benchmarks, with overall performance competitive with the most recent state-of-the-art systems."
# 10表示输出的前10个
# tags = jieba.analyse.extract_tags(content, topK=10, withWeight=True)
#
# print(tags)
# print(",".join(tags))


def getKey(str, top_n=20, stopwords=None):
    """Count word frequencies over text rows and store the most frequent words.

    The first element of every row is segmented with ``jieba.lcut``. Tokens of
    length 1 and tokens found in ``stopwords`` are ignored. The ``top_n`` most
    frequent words are written with ``mysqlUtil.insert_key`` and printed.

    Args:
        str: Iterable of rows whose first element is the text to segment.
            (The name shadows the builtin; it is kept for backward
            compatibility with existing callers.)
        top_n: Maximum number of words to store and print.
        stopwords: Container of words to skip. When omitted, the module-level
            ``nolist`` is used, which this file only defines when run as a
            script.

    Returns:
        A list of ``(word, count)`` tuples for every counted word, sorted by
        count in descending order.
    """
    if stopwords is None:
        stopwords = nolist

    counts = {}
    for row in str:
        for word in jieba.lcut(row[0]):
            # Single-character tokens and stop words are not counted.
            if len(word) == 1 or word in stopwords:
                continue
            counts[word] = counts.get(word, 0) + 1

    # Most frequent first; ties keep first-seen order (stable sort).
    items = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # Slice rather than index a fixed range: fewer than top_n distinct words
    # (including no words at all) must not raise IndexError.
    for word, count in items[:max(top_n, 0)]:
        mysqlUtil.insert_key(word, count)
        print('{0:<5}{1:<5}'.format(word, count))

    return items


if __name__ == '__main__':
    # Stop words left out of the keyword count; getKey() reads this
    # module-level name.
    nolist = {
        'are', 'is', 'am', 'and', 'of', 'but', 'so',
        'which', 'where', 'when', 'how', 'what', 'that', 'who', 'whose',
        'in', 'at', 'with', 'for', 'the', 'a', 'an', 'to', 'on',
        'we', 'We', 'this', 'by', 'from', 'our', 'as',
        'The', 'can', 'he', 'He', 'be', 'In',
    }
    # Feed the rows returned by mysqlUtil.select_ab() to the keyword counter.
    abstract_rows = mysqlUtil.select_ab()
    getKey(abstract_rows)

lunwenSpideer.py

# -*- coding:utf-8 -*-
import requests
import re
import json
import Mysql

# HTTP request headers carrying a desktop-Chrome User-Agent string.
headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"
}


def getData(timeout=30):
    """Crawl workshop papers from openaccess.thecvf.com and store each one.

    Steps:
      1. Fetch the site menu and extract every conference entry (name,
         Main Conference link, Workshops link).
      2. For each conference, fetch its workshops menu and extract the
         workshop page ids from the first ``<dl>`` element.
      3. For each workshop page, collect the paper links.
      4. For each paper page, extract author, title, book title, date,
         abstract and PDF link, and pass them to ``Mysql.insert_item`` as a
         one-row list ``[[author, title, booktitle, date, abstract, pdf_url]]``.

    Only the Workshops link of a conference is followed; the Main Conference
    link is collected but not crawled.

    Pages that do not match the expected markup are reported and skipped, so
    one unexpected page does not abort the whole crawl.

    Args:
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises a timeout error.
    """
    base = 'https://openaccess.thecvf.com'

    def fetch(url):
        # Send the module-level browser-like headers and bound the wait time.
        return requests.get(url, headers=headers, timeout=timeout).text

    # Patterns are compiled once here instead of on every loop iteration.
    # The literal square brackets around the menu links must be escaped.
    conference_re = re.compile(
        r'<dd>(.*?) \[<a href="(.*?)">Main Conference</a>\]'
        r'  \[<a href="(.*?)/menu.*?">Workshops</a>\]</dd>'
    )
    listing_re = re.compile(r'<dl>(.*?)</dl>', re.DOTALL)
    workshop_re = re.compile(
        r'<a href="/?(?:\w+/)?(\w+)">(.*?)</a><br><br>.*?</dd>', re.DOTALL
    )
    paper_link_re = re.compile(r'<dt class="ptitle"><br><a href="(.*?)">')
    paper_re = re.compile(
        r'<meta name="citation_pdf_url" content="(.*?)">'
        r'.*?<div id="abstract">(.*?)</div>'
        r'.*?author\s+=\s+\{(.*?)\}'
        r'.*?title\s+=\s+\{(.*?)\}'
        r'.*?booktitle\s+=\s+\{(.*?)\}'
        r'.*?month\s+=\s+\{(.*?)\}'
        r'.*?year\s+=\s+\{(.*?)\}',
        re.DOTALL,
    )

    menu_html = fetch(base + '/menu').replace('\n', '').replace('<br>', '')
    conferences = conference_re.findall(menu_html)
    print("会议有"+str(len(conferences)))
    print(conferences)

    # One entry per conference: [name, main-conference URL, workshops URL].
    conference_links = [
        [name, base + main_href, base + workshops_href]
        for name, main_href, workshops_href in conferences
    ]

    for conference in conference_links:
        workshops_root = conference[2]
        workshops_menu_url = workshops_root + '/menu'
        workshops_html = fetch(workshops_menu_url).replace('.py', '')
        print(workshops_menu_url)

        listings = listing_re.findall(workshops_html)
        if not listings:
            print('no <dl> listing found, skipping: ' + workshops_menu_url)
            continue
        workshops = workshop_re.findall(listings[0])
        print(workshops)

        for workshop in workshops:
            workshop_url = workshops_root + '/' + workshop[0]
            print(workshop_url)
            paper_hrefs = paper_link_re.findall(fetch(workshop_url))

            for href in paper_hrefs:
                paper_url = base + href  # paper page link
                print(paper_url)
                paper_html = fetch(paper_url).replace('\n', '')
                fields = paper_re.findall(paper_html)
                print(fields)
                if not fields:
                    print('unexpected paper page layout, skipping: ' + paper_url)
                    continue

                pdf_url, abstract, author, title, booktitle, month, year = fields[0]
                date = month + ',' + year
                Mysql.insert_item([[author, title, booktitle, date, abstract, pdf_url]])


# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    getData()