Implementing document query with the vector space model (VSM)


Structure of a query (topic) in the XML query file:

<topic>
<number>CIRB010TopicZH006</number>
<title>科索沃難民潮</title>
<question>
查詢科索沃戰爭中的難民潮情況,以及國際間對其采取的援助。
</question>
<narrative>
相關文件內容包含科省難民湧入的地點、人數。受安置的狀況,難民潮引發的問題,参與救援之國家與國際組織,其援助策略與行動內容之報導。
</narrative>
<concepts>
科省、柯省、科索沃、柯索伏、難民、難民潮、難民營、援助、收容、救援、醫療、人道、避難、馬其頓、土耳其、外交部、國際、聯合國、紅十字會、阿爾巴尼亞裔難民。
</concepts>
</topic>
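
A minimal sketch of pulling these fields out with xml.dom.minidom, assuming the topics sit in a file called query.xml (a placeholder name); the full script below reads the same five tags:

import xml.dom.minidom

doc=xml.dom.minidom.parse('query.xml')	#placeholder file name
for topic in doc.documentElement.getElementsByTagName('topic'):
	number=topic.getElementsByTagName('number')[0].childNodes[0].data
	title=topic.getElementsByTagName('title')[0].childNodes[0].data
	question=topic.getElementsByTagName('question')[0].childNodes[0].data
	narrative=topic.getElementsByTagName('narrative')[0].childNodes[0].data
	concepts=topic.getElementsByTagName('concepts')[0].childNodes[0].data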

What the document list looks like (file-list):

CIRB010/cdn/loc/CDN_LOC_0001457
CIRB010/cdn/loc/CDN_LOC_0000294
CIRB010/cdn/loc/CDN_LOC_0000120
CIRB010/cdn/loc/CDN_LOC_0000661
CIRB010/cdn/loc/CDN_LOC_0001347
CIRB010/cdn/loc/CDN_LOC_0000439
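
Each line is a path relative to the NTCIR directory, and a document is identified by its (0-based) line number here; the final ranked list uses the last path component, lowercased. A quick sketch (model-dir is a placeholder):

fileList=open('model-dir/file-list').read().splitlines()
print fileList[0].split('/')[3].lower()	#-> cdn_loc_0001457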


What the vocabulary looks like (vocab.all); for Chinese text each single character gets its own line (the first line here, utf8, records the file's encoding):

utf8
Copper
version
EGCG
432Kbps
RESERVECHARDONNAY
TommyHolloway
platts
Celeron266MHz
VOLKSWAGEN
INDEX
SmarTone
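
Terms are identified by their line number in this file, so a dictionary mapping each term to its line number gives constant-time lookups; a sketch equivalent to the loop in the script below (model-dir is again a placeholder):

vocab=open('model-dir/vocab.all').read().splitlines()
vocabDict=dict((w,i) for i,w in enumerate(vocab))	#term -> line number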


Representation of the inverted file (inverted-file):

vocab line number of term 1   vocab line number of term 2 (-1 means a unigram; only term 1 applies)   document frequency (df)

line number of the document in file-list   number of times the term occurs in that document

1 -1 2
33689 1
38365 1
2 -1 1
33256 1
2 12371 1
33256 1
3 -1 1
10849 2
3 6756 1
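
Reading the sample: "1 -1 2" says the term on vocab line 1 is a unigram (second field -1) that occurs in 2 documents, and the next 2 lines are its postings (document line number, occurrence count); "2 12371 1" is a bigram built from vocab lines 2 and 12371. A generic parsing sketch, assuming every header is followed by exactly df posting lines as the sample suggests (model-dir is a placeholder):

inv=open('model-dir/inverted-file').read().splitlines()
i=0
while i<len(inv):
	vid1,vid2,df=inv[i].split(' ')	#header: term 1, term 2 (-1 = unigram), document frequency
	for p in inv[i+1:i+1+int(df)]:
		docLine,tf=p.split(' ')	#posting: line in file-list, occurrence count
	i+=1+int(df)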

Code implementation (only single characters, i.e. unigrams, are considered):


#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import getopt
from xml.dom.minidom import parse
import xml.dom.minidom
import scipy.sparse as sp
from numpy import *
from math import log
from sklearn.preprocessing import normalize

#parse the command-line arguments
def main(argv):
	ifFeedback=False
	try:
		opts,args=getopt.getopt(argv,'ri:o:m:d:',[])
	except getopt.GetoptError:
		#exit here, otherwise opts is unbound and the loop below crashes
		print 'wrong input'
		sys.exit(2)
	for opt,arg in opts:
		if opt=='-r' and ifFeedback==False:
			ifFeedback=True
		elif opt=='-i':
			queryFile=arg
		elif opt=='-o':
			rankedList=arg
		elif opt=='-m':
			modelDir=arg
		elif opt=='-d':
			NTCIRDir=arg
		else:
			pass
	return ifFeedback,queryFile,rankedList,modelDir,NTCIRDir		


#get the path in the arguments
ifFeedback,queryFile,rankedList,modelDir,NTCIRDir=main(sys.argv[1:])
#print ifFeedback,queryFile,rankedList,modelDir,NTCIRDir

#get the file path in the model-dir
vocab=modelDir+'/vocab.all'
fileList=modelDir+'/file-list'
invList=modelDir+'/inverted-file'

#read
pf=open(vocab,'r')
vocab=pf.read()
pf.close()

pf=open(fileList,'r')
fileList=pf.read()
pf.close()

pf=open(invList,'r')
invList=pf.read()
pf.close()

#splitlines
vocab=vocab.splitlines()
fileList=fileList.splitlines()
invList=invList.splitlines()

# vocab dict: term -> line number in vocab.all
vocabDict={}
for k,w in enumerate(vocab):
	vocabDict[w]=k


#build the term-frequency matrix (dense first, converted to CSR below)
#and the per-term document-frequency vector (turned into idf later)

IDFVector=zeros(len(vocab))
totalDocs=len(fileList)
count=0
tempMatrix=zeros((len(fileList),len(vocab)))

while count<len(invList):
	post=invList[count].split(' ')
	k=1
	#just deal with single characters (unigram entries have -1 as the second field)
	if(len(post)>2 and post[1]=='-1'):
		#store the document frequency for now; it is converted to idf below
		IDFVector[int(post[0])]=int(post[2])
		#the df posting lines follow the header directly
		while k<=int(post[2]):
			line=invList[count+k].split(' ')
			tempMatrix[int(line[0])][int(post[0])]=int(line[1])
			k+=1
	count+=k

tfMatrix=sp.csr_matrix(tempMatrix)

#BM25 term-frequency weighting
doclens=tfMatrix.sum(1)	#document lengths (total term count per row)
avglen=doclens.mean()	#average document length
k=7	#BM25 k1 parameter
b=0.7	#BM25 length-normalization parameter
tp1=tfMatrix*(k+1)	#numerator tf*(k+1)
tp2=k*(1-b+b*doclens/avglen)	#per-document part of the denominator
tfMatrix.data+=array(tp2[tfMatrix.tocoo().row]).reshape(len(tfMatrix.data))
tfMatrix.data=tp1.data/tfMatrix.data
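#the two .data assignments above implement the BM25 tf component:
#  tf' = tf*(k+1) / (tf + k*(1 - b + b*doclen/avglen))
#k (usually called k1) and b are free parameters; k=7 and b=0.7 are this
#script's choices (textbook defaults are closer to k1 in [1.2, 2.0], b=0.75)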

#calculate the idf
k=0
while k<len(vocab):
	if IDFVector[k]!=0:
		IDFVector[k]=log(float(totalDocs)/IDFVector[k])
	k+=1
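#IDFVector held raw document frequencies df up to this point;
#it now holds idf = log(N/df), with N = totalDocs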
#tf-idf
tfMatrix.data*=IDFVector[tfMatrix.indices]

#row normalization for tf-idf matrix
normalize(tfMatrix,norm='l2',axis=1,copy=False)

#deal with the query
doc=xml.dom.minidom.parse(queryFile)
root=doc.documentElement
topics=root.getElementsByTagName('topic')
rankList=''
for topic in topics:
	#query vector
	qVector=zeros(len(vocab))

	number=topic.getElementsByTagName('number')[0].childNodes[0].data
	title=topic.getElementsByTagName('title')[0].childNodes[0].data

	question=topic.getElementsByTagName('question')[0].childNodes[0].data
	narrative=topic.getElementsByTagName('narrative')[0].childNodes[0].data
	concepts=topic.getElementsByTagName('concepts')[0].childNodes[0].data
	
	#fold question and concepts into the narrative: all three feed the query
	narrative+=question+concepts
	#count each character of the query text that appears in the vocabulary
	for w in narrative:
		ch=w.encode('utf8')
		if ch in vocabDict:
			qVector[vocabDict[ch]]+=1
	for w in title:
		ch=w.encode('utf8')
		if ch in vocabDict:
			qVector[vocabDict[ch]]+=1
	#L2-normalize the query vector (sklearn's normalize expects a 2-D array)
	qVector=normalize(qVector.reshape(1,-1),norm='l2').ravel()
	#similarity: sparse dot product of every document row with the query
	sim=tfMatrix*(sp.csr_matrix(qVector).transpose())

	sim=sim.toarray()
	k=0
	simCount=[]
	while k<len(fileList):
		#pair each similarity score with its document line number
		simCount.append((sim[k,0],k))
		k+=1

	#sort
	simCount.sort(reverse=True)
	simCount=simCount[:100]
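	#pseudo-relevance feedback: a simplified Rocchio update in which the
	#original query keeps weight 1, i.e. q' = q + 0.8 * centroid(top 20 docs)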
	if ifFeedback:
		topk=[]
		for score,k in simCount[:20]:
			topk.append(k)
		d=tfMatrix[topk,:].sum(0)/20
		qVector+=array(0.8*d).reshape(len(qVector))
	#re-normalize the (possibly expanded) query vector
	qVector=normalize(qVector.reshape(1,-1),norm='l2').ravel()
	#recompute similarities against the updated query
	sim=tfMatrix*(sp.csr_matrix(qVector).transpose())

	sim=sim.toarray()
	k=0
	simCount=[]
	while k<len(fileList):
		simCount.append((sim[k,0],k))
		k+=1

	#sort
	simCount.sort(reverse=True)
	simCount=simCount[:100]

	num=number.split('ZH')[1]	#e.g. 'CIRB010TopicZH006' -> '006'
	for sim in simCount:
		name=fileList[sim[1]]
		name=name.split('/')[3].lower()
		rank=num+' '+name
		rankList+=rank+'\n'

pf=open(rankedList,'w')
pf.write(rankList)
pf.close()
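
The script is driven by its command-line options (getopt string 'ri:o:m:d:'). A possible invocation, where vsm.py and the file names are placeholders and -r switches on the feedback pass:

python vsm.py -i query-test.xml -o ranked-list -m model-dir -d CIRB010
python vsm.py -r -i query-test.xml -o ranked-list -m model-dir -d CIRB010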





Original post: https://www.cnblogs.com/yjbjingcha/p/7340590.html