NeurIPS 2020 paper list download script

  1 # %% NIPS 2020 论文信息下载
  2 import json
  3 import os
  4 import re
  5 
  6 import pandas as pd
  7 import requests
  8 import tqdm
  9 from bs4 import BeautifulSoup
 10 
 11 
 12 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 13 
 14 # %%
 15 PAPER_HASH_PATTERN = re.compile(r'poster_(?P<UID>w+).html')
 16 SESSION_PATTERN = re.compile(r'Orals & Spotlights Track d+:s*(?P<session>[^;]*)')
 17 
 18 
 19 def cleanup_string(s):
 20     s = s.strip()
 21     while '  ' in s:
 22         s = s.replace('  ', ' ')
 23     return s
 24 
 25 
 26 def download_file(download_url, file_name=None):
 27     if file_name is None:
 28         file_name = os.path.basename(download_url)
 29     response = requests.get(download_url, stream=True)
 30     total = int(response.headers.get('Content-Length'))
 31     pbar = None
 32     if total is not None:
 33         pbar = tqdm.tqdm(desc=f'Downloading from {download_url} to {file_name}',
 34                          total=total, unit='B', unit_scale=True, unit_divisor=1000)
 35     with open(file_name, 'wb') as file:
 36         for chunk in response.iter_content(chunk_size=10240):
 37             if chunk:
 38                 file.write(chunk)
 39             if pbar is not None:
 40                 pbar.update(len(chunk))
 41 
 42 
 43 # %%
 44 # download paper list
 45 if not os.path.exists('papers.json'):
 46     download_file('https://neurips.cc/virtual/2020/public/papers.json', file_name='papers.json')
 47 
 48 # %%
 49 # get oral paper list
 50 oral_papers = set()
 51 response = requests.get('https://neurips.cc/virtual/2020/public/f_orals.html')
 52 soup = BeautifulSoup(response.text, 'html.parser')
 53 for tag in soup.find_all('a', href=PAPER_HASH_PATTERN):
 54     href = tag['href']
 55     UID = PAPER_HASH_PATTERN.search(href).group('UID')
 56     oral_papers.add(UID)
 57 
 58 # %%
 59 # process paper list
 60 with open('papers.json', mode='r') as file:
 61     data = json.load(file)
 62 
 63 df = pd.DataFrame(columns=['ID', 'Category', 'Title', 'Authors', 'Keywords', 'Sessions', 'URL', 'Proceedings URL', 'PDF URL', 'UID'])
 64 for i, paper in enumerate(tqdm.tqdm(data)):
 65     if paper['eventtype'] != 'Poster':
 66         continue
 67 
 68     UID = paper['UID']
 69     category = 'Poster'
 70     sessions = '; '.join(paper['sessions'])
 71     sessions = '; '.join([match.group('session') for match in SESSION_PATTERN.finditer(sessions)])
 72     sessions = cleanup_string(sessions)
 73     if sessions != '':
 74         category = 'Spotlight'
 75     if UID in oral_papers:
 76         category = 'Oral'
 77 
 78     keywords = set()
 79     for keyword in ('; '.join(paper['keywords'])).split('; '):
 80         keyword = cleanup_string(keyword)
 81         if keyword != '':
 82             keywords.add(keyword)
 83     keywords = '
'.join(sorted(keywords))
 84 
 85     paper = {
 86         'ID': paper['id'],
 87         'Category': category,
 88         'Title': cleanup_string(paper['title']),
 89         'Authors': cleanup_string(', '.join(paper['authors'])),
 90         'Keywords': keywords,
 91         'Sessions': sessions,
 92         'URL': f'https://neurips.cc/virtual/2020/public/poster_{UID}.html',
 93         'Proceedings URL': paper['paper_pdf_url'],
 94         'PDF URL': f'https://proceedings.neurips.cc/paper/2020/file/{UID}-Paper.pdf',
 95         'UID': UID
 96     }
 97     df.loc[len(df)] = paper
 98 
 99 df['Category'] = pd.Categorical(df['Category'], categories=['Oral', 'Spotlight', 'Poster'])
100 df.sort_values(by=['Category', 'Sessions', 'Keywords'], inplace=True)
101 df.to_csv('paper_list.csv', index=False)
102 
# %%
# Expand the newline-separated Keywords cell into one indicator column
# ('Y' / NaN) per subject area.
all_subject_areas = set()
for paper in tqdm.tqdm(df.iloc, total=len(df)):
    if paper['Keywords'] == '':
        continue
    all_subject_areas.update(paper['Keywords'].split('\n'))
all_subject_areas.discard('')

df = df.reindex(columns=df.columns.to_list() + sorted(all_subject_areas))
# Iterate by index label and assign with df.at: the frame was sorted above
# without resetting its index, so the original positional chained assignment
# df[area][i] marked the wrong rows (and chained assignment may not write
# through at all).
for idx, paper in df.iterrows():
    for area in paper['Keywords'].split('\n'):
        if area != '':
            df.at[idx, area] = 'Y'

# NOTE(review): fixed the "NeuraIPS" typo in the output file name.
df.to_csv('NeurIPS Papers.csv', index=False)
Original source: https://www.cnblogs.com/imoon22/p/14255581.html