DocumentSimilarity

Read the data file

abstracts = [line.strip() for line in file('../DATA/AbstractData.txt')]
print abstracts[:1]
['25177 Given n non-vertical lines in 3-space, their vertical depth (above/below) relation can contain cycles. We show that the lines can be cut into O(n3/2polylog n) pieces, such that the depth relation among these pieces is now a proper partial order. This bound is nearly tight in the worst case. As a consequence, we deduce that the number of pairwise non-overlapping cycles, namely, cycles whose xy-projections do not overlap, is O(n3/2polylog n); this bound too is almost tight in the worst case. Previous results on this topic could only handle restricted cases of the problem (such as handling only triangular cycles, by Aronov, Koltun, and Sharir, or only cycles in grid-like patterns, by Chazelle et al.), and the bounds were considerably weaker&#x2014;much closer to the trivial quadratic bound. Our proof uses a recent variant of the polynomial partitioning technique, due to Guth, and some simple tools from algebraic geometry. It is much more straightforward than the previous &#x201C;purely combinatorial&#x201D; methods. Our approach extends to eliminating all cycles in the depth relation among segments, and among constant-degree algebraic arcs. We hope that a suitable extension of this technique could be used to handle the much more difficult case of pairwise-disjoint triangles as well. Our results almost completely settle a long-standing (35 years old) open problem in computational geometry, motivated by hidden-surface removal in computer graphics. </p>']

Extract the abstract IDs

abstractsId = [abstract.split(' ')[0] for abstract in abstracts]
print abstractsId[:1]
['25177']

Convert to lowercase

abstractLower = [abstract.lower().split() for abstract in abstracts]
print abstractLower[:1]
[['25177', 'given', 'n', 'non-vertical', 'lines', 'in', '3-space,', 'their', 'vertical', 'depth', '(above/below)', 'relation', 'can', 'contain', 'cycles.', 'we', 'show', 'that', 'the', 'lines', 'can', 'be', 'cut', 'into', 'o(n3/2polylog', 'n)', 'pieces,', 'such', 'that', 'the', 'depth', 'relation', 'among', 'these', 'pieces', 'is', 'now', 'a', 'proper', 'partial', 'order.', 'this', 'bound', 'is', 'nearly', 'tight', 'in', 'the', 'worst', 'case.', 'as', 'a', 'consequence,', 'we', 'deduce', 'that', 'the', 'number', 'of', 'pairwise', 'non-overlapping', 'cycles,', 'namely,', 'cycles', 'whose', 'xy-projections', 'do', 'not', 'overlap,', 'is', 'o(n3/2polylog', 'n);', 'this', 'bound', 'too', 'is', 'almost', 'tight', 'in', 'the', 'worst', 'case.', 'previous', 'results', 'on', 'this', 'topic', 'could', 'only', 'handle', 'restricted', 'cases', 'of', 'the', 'problem', '(such', 'as', 'handling', 'only', 'triangular', 'cycles,', 'by', 'aronov,', 'koltun,', 'and', 'sharir,', 'or', 'only', 'cycles', 'in', 'grid-like', 'patterns,', 'by', 'chazelle', 'et', 'al.),', 'and', 'the', 'bounds', 'were', 'considerably', 'weaker&#x2014;much', 'closer', 'to', 'the', 'trivial', 'quadratic', 'bound.', 'our', 'proof', 'uses', 'a', 'recent', 'variant', 'of', 'the', 'polynomial', 'partitioning', 'technique,', 'due', 'to', 'guth,', 'and', 'some', 'simple', 'tools', 'from', 'algebraic', 'geometry.', 'it', 'is', 'much', 'more', 'straightforward', 'than', 'the', 'previous', '&#x201c;purely', 'combinatorial&#x201d;', 'methods.', 'our', 'approach', 'extends', 'to', 'eliminating', 'all', 'cycles', 'in', 'the', 'depth', 'relation', 'among', 'segments,', 'and', 'among', 'constant-degree', 'algebraic', 'arcs.', 'we', 'hope', 'that', 'a', 'suitable', 'extension', 'of', 'this', 'technique', 'could', 'be', 'used', 'to', 'handle', 'the', 'much', 'more', 'difficult', 'case', 'of', 'pairwise-disjoint', 'triangles', 'as', 'well.', 'our', 'results', 'almost', 'completely', 'settle', 'a', 'long-standing', '(35', 'years', 'old)', 'open', 'problem', 'in', 'computational', 'geometry,', 'motivated', 'by', 'hidden-surface', 'removal', 'in', 'computer', 'graphics.', '</p>']]

Separate punctuation from words (tokenization)

from nltk.tokenize import word_tokenize
abstractsTokenized = [[word.lower() for word in word_tokenize(abstract.decode('utf-8'))] for abstract in abstracts]
print abstractsTokenized[:1]
[[u'25177', u'given', u'n', u'non-vertical', u'lines', u'in', u'3-space', u',', u'their', u'vertical', u'depth', u'(', u'above/below', u')', u'relation', u'can', u'contain', u'cycles', u'.', u'we', u'show', u'that', u'the', u'lines', u'can', u'be', u'cut', u'into', u'o', u'(', u'n3/2polylog', u'n', u')', u'pieces', u',', u'such', u'that', u'the', u'depth', u'relation', u'among', u'these', u'pieces', u'is', u'now', u'a', u'proper', u'partial', u'order', u'.', u'this', u'bound', u'is', u'nearly', u'tight', u'in', u'the', u'worst', u'case', u'.', u'as', u'a', u'consequence', u',', u'we', u'deduce', u'that', u'the', u'number', u'of', u'pairwise', u'non-overlapping', u'cycles', u',', u'namely', u',', u'cycles', u'whose', u'xy-projections', u'do', u'not', u'overlap', u',', u'is', u'o', u'(', u'n3/2polylog', u'n', u')', u';', u'this', u'bound', u'too', u'is', u'almost', u'tight', u'in', u'the', u'worst', u'case', u'.', u'previous', u'results', u'on', u'this', u'topic', u'could', u'only', u'handle', u'restricted', u'cases', u'of', u'the', u'problem', u'(', u'such', u'as', u'handling', u'only', u'triangular', u'cycles', u',', u'by', u'aronov', u',', u'koltun', u',', u'and', u'sharir', u',', u'or', u'only', u'cycles', u'in', u'grid-like', u'patterns', u',', u'by', u'chazelle', u'et', u'al', u'.', u')', u',', u'and', u'the', u'bounds', u'were', u'considerably', u'weaker', u'&', u'#', u'x2014', u';', u'much', u'closer', u'to', u'the', u'trivial', u'quadratic', u'bound', u'.', u'our', u'proof', u'uses', u'a', u'recent', u'variant', u'of', u'the', u'polynomial', u'partitioning', u'technique', u',', u'due', u'to', u'guth', u',', u'and', u'some', u'simple', u'tools', u'from', u'algebraic', u'geometry', u'.', u'it', u'is', u'much', u'more', u'straightforward', u'than', u'the', u'previous', u'&', u'#', u'x201c', u';', u'purely', u'combinatorial', u'&', u'#', u'x201d', u';', u'methods', u'.', u'our', u'approach', u'extends', u'to', u'eliminating', u'all', u'cycles', u'in', u'the', u'depth', u'relation', u'among', u'segments', u',', u'and', u'among', u'constant-degree', u'algebraic', u'arcs', u'.', u'we', u'hope', u'that', u'a', u'suitable', u'extension', u'of', u'this', u'technique', u'could', u'be', u'used', u'to', u'handle', u'the', u'much', u'more', u'difficult', u'case', u'of', u'pairwise-disjoint', u'triangles', u'as', u'well', u'.', u'our', u'results', u'almost', u'completely', u'settle', u'a', u'long-standing', u'(', u'35', u'years', u'old', u')', u'open', u'problem', u'in', u'computational', u'geometry', u',', u'motivated', u'by', u'hidden-surface', u'removal', u'in', u'computer', u'graphics', u'.', u'<', u'/p', u'>']]

Remove stopwords

from nltk.corpus import stopwords
englishStopwords = stopwords.words('english')
print len(englishStopwords)
abstractFilterStopwords = [[word for word in abstract if word not in englishStopwords] for abstract in abstractsTokenized]
print abstractFilterStopwords[:1]
[[u'25177', u'given', u'n', u'non-vertical', u'lines', u'3-space', u',', u'vertical', u'depth', u'(', u'above/below', u')', u'relation', u'contain', u'cycles', u'.', u'show', u'lines', u'cut', u'o', u'(', u'n3/2polylog', u'n', u')', u'pieces', u',', u'depth', u'relation', u'among', u'pieces', u'proper', u'partial', u'order', u'.', u'bound', u'nearly', u'tight', u'worst', u'case', u'.', u'consequence', u',', u'deduce', u'number', u'pairwise', u'non-overlapping', u'cycles', u',', u'namely', u',', u'cycles', u'whose', u'xy-projections', u'overlap', u',', u'o', u'(', u'n3/2polylog', u'n', u')', u';', u'bound', u'almost', u'tight', u'worst', u'case', u'.', u'previous', u'results', u'topic', u'could', u'handle', u'restricted', u'cases', u'problem', u'(', u'handling', u'triangular', u'cycles', u',', u'aronov', u',', u'koltun', u',', u'sharir', u',', u'cycles', u'grid-like', u'patterns', u',', u'chazelle', u'et', u'al', u'.', u')', u',', u'bounds', u'considerably', u'weaker', u'&', u'#', u'x2014', u';', u'much', u'closer', u'trivial', u'quadratic', u'bound', u'.', u'proof', u'uses', u'recent', u'variant', u'polynomial', u'partitioning', u'technique', u',', u'due', u'guth', u',', u'simple', u'tools', u'algebraic', u'geometry', u'.', u'much', u'straightforward', u'previous', u'&', u'#', u'x201c', u';', u'purely', u'combinatorial', u'&', u'#', u'x201d', u';', u'methods', u'.', u'approach', u'extends', u'eliminating', u'cycles', u'depth', u'relation', u'among', u'segments', u',', u'among', u'constant-degree', u'algebraic', u'arcs', u'.', u'hope', u'suitable', u'extension', u'technique', u'could', u'used', u'handle', u'much', u'difficult', u'case', u'pairwise-disjoint', u'triangles', u'well', u'.', u'results', u'almost', u'completely', u'settle', u'long-standing', u'(', u'35', u'years', u'old', u')', u'open', u'problem', u'computational', u'geometry', u',', u'motivated', u'hidden-surface', u'removal', u'computer', u'graphics', u'.', u'<', u'/p', u'>']]

Remove punctuation

englishPunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','<','>','=','{','}','+','"','-','/']
abstracts = [[word for word in abstract if word not in englishPunctuations] for abstract in abstractFilterStopwords]
print abstracts[:1]
[[u'25177', u'given', u'n', u'non-vertical', u'lines', u'3-space', u'vertical', u'depth', u'above/below', u'relation', u'contain', u'cycles', u'show', u'lines', u'cut', u'o', u'n3/2polylog', u'n', u'pieces', u'depth', u'relation', u'among', u'pieces', u'proper', u'partial', u'order', u'bound', u'nearly', u'tight', u'worst', u'case', u'consequence', u'deduce', u'number', u'pairwise', u'non-overlapping', u'cycles', u'namely', u'cycles', u'whose', u'xy-projections', u'overlap', u'o', u'n3/2polylog', u'n', u'bound', u'almost', u'tight', u'worst', u'case', u'previous', u'results', u'topic', u'could', u'handle', u'restricted', u'cases', u'problem', u'handling', u'triangular', u'cycles', u'aronov', u'koltun', u'sharir', u'cycles', u'grid-like', u'patterns', u'chazelle', u'et', u'al', u'bounds', u'considerably', u'weaker', u'x2014', u'much', u'closer', u'trivial', u'quadratic', u'bound', u'proof', u'uses', u'recent', u'variant', u'polynomial', u'partitioning', u'technique', u'due', u'guth', u'simple', u'tools', u'algebraic', u'geometry', u'much', u'straightforward', u'previous', u'x201c', u'purely', u'combinatorial', u'x201d', u'methods', u'approach', u'extends', u'eliminating', u'cycles', u'depth', u'relation', u'among', u'segments', u'among', u'constant-degree', u'algebraic', u'arcs', u'hope', u'suitable', u'extension', u'technique', u'could', u'used', u'handle', u'much', u'difficult', u'case', u'pairwise-disjoint', u'triangles', u'well', u'results', u'almost', u'completely', u'settle', u'long-standing', u'35', u'years', u'old', u'open', u'problem', u'computational', u'geometry', u'motivated', u'hidden-surface', u'removal', u'computer', u'graphics', u'/p']]

Stem the words

from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
abstractsStemmed = [[st.stem(word) for word in abstract] for abstract in abstracts]
print abstractsStemed[:1]
[[u'25177', u'giv', u'n', u'non-vertical', u'lin', u'3-space', u'vert', u'dep', u'above/below', u'rel', u'contain', u'cyc', u'show', u'lin', u'cut', u'o', u'n3/2polylog', u'n', u'piec', u'dep', u'rel', u'among', u'piec', u'prop', u'part', u'ord', u'bound', u'near', u'tight', u'worst', u'cas', u'consequ', u'deduc', u'numb', u'pairw', u'non-overlapping', u'cyc', u'nam', u'cyc', u'whos', u'xy-projections', u'overlap', u'o', u'n3/2polylog', u'n', u'bound', u'almost', u'tight', u'worst', u'cas', u'prevy', u'result', u'top', u'could', u'handl', u'restrict', u'cas', u'problem', u'handl', u'triangul', u'cyc', u'aronov', u'koltun', u'sharir', u'cyc', u'grid-like', u'pattern', u'chazel', u'et', u'al', u'bound', u'consid', u'weak', u'x2014', u'much', u'clos', u'triv', u'quadr', u'bound', u'proof', u'us', u'rec', u'vary', u'polynom', u'partit', u'techn', u'due', u'guth', u'simpl', u'tool', u'algebra', u'geometry', u'much', u'straightforward', u'prevy', u'x201c', u'pur', u'combin', u'x201d', u'method', u'approach', u'extend', u'elimin', u'cyc', u'dep', u'rel', u'among', u'seg', u'among', u'constant-degree', u'algebra', u'arc', u'hop', u'suit', u'extend', u'techn', u'could', u'us', u'handl', u'much', u'difficult', u'cas', u'pairwise-disjoint', u'triangl', u'wel', u'result', u'almost', u'complet', u'settl', u'long-stand', u'35', u'year', u'old', u'op', u'problem', u'comput', u'geometry', u'mot', u'hidden-surface', u'remov', u'comput', u'graph', u'/p']]
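
Lancaster is the most aggressive of NLTK's stemmers, as the output shows ('previous' becomes 'prevy'). If the stems turn out too lossy for your data, Porter or Snowball are milder drop-in replacements; a quick comparison sketch, with expected outputs as comments:

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
print st.stem('previous')                          # 'prevy' (Lancaster, as above)
print PorterStemmer().stem('previous')             # 'previou'
print SnowballStemmer('english').stem('previous')  # 'previous'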

Remove low-frequency words

allStem = sum(abstractsStemmed, [])
# find the stems that occur only once in the whole corpus
stemOnce = [stem for stem in set(allStem) if allStem.count(stem) == 1]
abstractOver = [[word for word in abstract if word not in stemOnce] for abstract in abstractsStemmed]
print abstractOver[:1]
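
Calling list.count inside a loop over set(allStem) is quadratic and will crawl on tens of thousands of abstracts. A one-pass alternative with collections.Counter gives the same result in linear time (variable names here are just for the sketch):

from collections import Counter
stemFreq = Counter(allStem)  # single pass over all stems
stemOnceSet = set(stem for stem, freq in stemFreq.iteritems() if freq == 1)
# membership tests against a set are O(1), unlike the stemOnce list above
abstractOver = [[word for word in abstract if word not in stemOnceSet] for abstract in abstractsStemmed]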

Count word occurrences in each abstract

from gensim import corpora, models, similarities
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
abstractDict = corpora.Dictionary(abstractOver)
#[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(0, 1), (4, 1), (5, 1), (7, 1), (8, 1), (9, 2), (10, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (10, 1)]]
#(0, 1) means that the token with id 0 appears once in that document
corpus = [abstractDict.doc2bow(text) for text in abstractOver]
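
The Dictionary can be read in both directions, which helps make sense of the (token_id, count) pairs; a quick check (stems, since the dictionary was built on the stemmed corpus):

print abstractDict[0]                    # the token behind id 0
print abstractDict.token2id.get(u'cyc')  # the id behind a stem, or None if it is absent
print corpus[0][:5]                      # first five (token_id, count) pairs of the first abstract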

Compute TF-IDF weights from the counts

tfidf = models.TfidfModel(corpus=corpus)
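
The fitted model maps any bag-of-words vector to TF-IDF weights; as a quick sanity check, the heaviest terms of the first abstract can be listed (exact output depends on the data):

weights = tfidf[corpus[0]]
topTerms = sorted(weights, key=lambda item: -item[1])[:5]
print [(abstractDict[termId], round(weight, 3)) for termId, weight in topTerms]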

Train the LSI model

# use the TF-IDF model to turn the bag-of-words document vectors into TF-IDF document vectors
corpusTfidf = tfidf[corpus]
# truncated SVD of the TF-IDF matrix projects each document into a 65-dimensional latent topic space
lsi = models.LsiModel(corpusTfidf, id2word=abstractDict, num_topics=65)
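
To eyeball what the latent dimensions captured, gensim can print the top-weighted stems per topic (the output goes through the INFO logger configured above):

lsi.print_topics(num_topics=5, num_words=8)  # inspect 5 of the 65 topics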

Build the similarity index

index = similarities.MatrixSimilarity(lsi[corpusTfidf])  # the LSI model was trained on TF-IDF vectors, so feed it TF-IDF vectors here too
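
Building the index scans every document, so it is worth persisting; the path below is just an example:

index.save('../DATA/abstracts.index')
# reload later with: index = similarities.MatrixSimilarity.load('../DATA/abstracts.index')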

Run a similarity query

test = abstractOver[0]
testBow = abstractDict.doc2bow(test)
testLsi = lsi[tfidf[testBow]]  # apply the same TF-IDF transform before projecting into LSI space
sims = index[testLsi]
sortSims = sorted(enumerate(sims), key=lambda item: -item[1])

Result

[(0, 1.0), (71593, 0.91843772), (14261, 0.90106153), (71746, 0.89864981), (10002, 0.88990259), (9620, 0.88964564), (9336, 0.88943446), (4707, 0.87609327), (5138, 0.87609327), (3973, 0.87609315)]
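
The first element of each pair is the document's position in the corpus, not its ID, and position 0 scoring 1.0 is just the query matching itself. Since the preprocessing preserved the input order, positions map straight back to the abstractsId list built earlier:

for docPos, score in sortSims[:10]:
    print abstractsId[docPos], score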

A progress bar for long-running for loops

import sys, time

class ShowProcess(object):
    """
    A simple console progress bar.
    Call show_process() once per iteration to redraw the bar.
    """
    i = 1           # current step
    max_steps = 0   # total number of steps
    max_arrow = 50  # width of the bar

    # the constructor takes the total number of steps
    def __init__(self, max_steps):
        self.max_steps = max_steps
        self.i = 0

    # redraw the bar for the current step i
    # output looks like [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>]100.00%
    def show_process(self, i=None):
        if i is not None:
            self.i = i
        num_arrow = int(self.i * self.max_arrow / self.max_steps)  # how many '>' to draw
        num_line = self.max_arrow - num_arrow                      # how many '-' to draw
        percent = self.i * 100.0 / self.max_steps                  # completion percentage, xx.xx%
        process_bar = '[' + '>' * num_arrow + '-' * num_line + ']' \
                      + '%.2f' % percent + '%' + '\r'  # '\r' returns to the start of the line without a newline
        sys.stdout.write(process_bar)  # print the bar to the terminal
        sys.stdout.flush()
        self.i += 1

    def close(self, words='done'):
        print ('')
        print (words)
        self.i = 1
import gensim

max_steps = 1000
process_bar = ShowProcess(max_steps)
n = 0
corpora_documents1 = []
for text, job in jobs:  # 'jobs' comes from the pp section below
    n = n + 1
    process_bar.show_process()
    document1 = gensim.models.doc2vec.TaggedDocument(words=job(), tags=[job()[0]])
    corpora_documents1.append(document1)
    if n == 1000:
        break
process_bar.close()
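
A minimal self-contained run of the bar, independent of the pp jobs (time is already imported above):

bar = ShowProcess(100)
for k in range(100):
    bar.show_process()
    time.sleep(0.01)  # stand-in for real work
bar.close()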

Parallel preprocessing with Python's Parallel Python (pp)

def AbstractTowordList(text):
    import nltk
    from nltk.tokenize import word_tokenize
    englishPunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '<', '>', '=', '{', '}', '+', '"', '-', '/', '/p']
    singleLetters = list('abcdefghijklmnopqrstuvwxyz')
    stopwordSet = set(nltk.corpus.stopwords.words('english'))  # build the stopword set once, not once per word
    tokens = [word.lower() for word in word_tokenize(text.decode('utf-8'))]
    return [word for word in tokens
            if word not in stopwordSet
            and word not in englishPunctuations
            and word not in singleLetters]
import time
import pp

ppservers = ()
job_server = pp.Server(4, ppservers=ppservers)
start = time.time()
result = []
# read the raw data
texts = [abst.strip() for abst in file('../DATA/AbstractData.txt')]
jobs = [(text, job_server.submit(AbstractTowordList,(text,),(),())) for text in texts ]
print time.time() - start
job_server.print_stats()
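
The result list initialised above was never filled; gathering every return value is one line, since calling job() blocks until that particular job has finished:

result = [job() for text, job in jobs]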

for text, job in jobs:
    print job()

This prints the word list each job returned.

Original post: https://www.cnblogs.com/wlc297984368/p/7750129.html