Jieba (结巴) word segmentation

# -*- coding: utf-8 -*-

import sys
import jieba
import jieba.posseg as pseg
import jieba.analyse as anal
from optparse import OptionParser

usage = "usage: python %prog [--tag] [--fast] [--stopdict FILE] [--tfidf topK] [--textr topK]";
parser = OptionParser(usage);
parser.add_option("--tag", dest="tag", action="store_true");
parser.add_option("--fast", dest="fast", action="store_true");
parser.add_option("--tfidf", dest="tfidf");
parser.add_option("--textr", dest="textr");
parser.add_option("--stopdict", dest="stopdict");
opt,args = parser.parse_args();

def wordFilter(wordlist):
    # drop empty tokens and any word found in the stopword file (--stopdict, one word per line)
    if opt.stopdict:
        with open(opt.stopdict, "r") as f:
            stopList = f.read().strip().split("\n");
    else:
        print "please specify a stopword file path with --stopdict";
        stopList = [];

    returnlist = [];

    for word in wordlist:
        if word:
            word = word.encode("utf-8");
            if word not in stopList:
                returnlist.append(word);

    return returnlist;



def wordPosFilter(wordlist):
    # keep only words whose POS tag is in save_post and that are not stopwords
    if opt.stopdict:
        with open(opt.stopdict, "r") as f:
            stopList = f.read().strip().split("\n");
    else:
        stopList = [];

    returnlist = [];
    # POS tags worth keeping: nouns, proper nouns, verbs and English tokens
    save_post = ["an","n","nr","ns","nt","nz","v","vd","eng","ni"];

    for w in wordlist:
        word = w.word.encode("utf-8");
        pos = w.flag;
        if word not in stopList and pos in save_post:
            returnlist.append(word);

    return returnlist;

txt = "支持三种分词模式: 精确模式,试图将句子最精确地切开,适合文本分析; 
全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义; 
搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。  
支持繁体分词 支持自定义词典 MIT 授权协议 在线演示";


# parallel (multiprocess) segmentation; not supported on Windows
if opt.fast:
    jieba.enable_parallel(10);



# customize the dictionary
jieba.add_word("全模式");              # make sure "全模式" is kept as one word
jieba.suggest_freq(("", ""), True);    # placeholder pair, left empty in the original

#jieba.load_userdict(dictfilepath);

#generator
#print "/".join(jieba.cut(txt));
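# for reference: the three modes described in txt can be called explicitly
# with the documented cut_all / cut_for_search switches, e.g.
#print "precise : " + "/".join(jieba.cut(txt, cut_all=False));
#print "full    : " + "/".join(jieba.cut(txt, cut_all=True));
#print "search  : " + "/".join(jieba.cut_for_search(txt));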

# note: this stoplist is read but never used below; filtering goes through --stopdict
with open("dict/stopword.txt") as f:
    stoplist = f.read().strip().split("\n");


rest = jieba.lcut(txt);

print "/".join(rest);
print "=========================  filter ===========================";
rest = wordFilter(rest);
print "/".join(rest);


psss = pseg.lcut(txt);

print "=========================  posFilter  ===========================";
psss = wordPosFilter(psss);
print "/".join(psss);


sys.exit();   # remove this line to run the remaining demos below

#list
print "/".join(jieba.lcut(txt));


#search mode
print "/".join(jieba.cut_for_search(txt));


#get word's position
res = jieba.tokenize(txt.decode("utf-8"));
#res = jieba.tokenize(txt.decode("utf-8"), mode="search"); #search mode
print "word		start		end";
for tk in res:
    print("%s		 %d 		 %d" % (tk[0],tk[1],tk[2]));



#tagging word
if opt.tag:
    for w,k in pseg.cut(txt):
        print w+"("+k+")",



#tfidf sort keyword
if opt.tfidf:
    topK =  int(opt.tfidf);
    tags = anal.extract_tags(txt, topK, withWeight=True);

    for word,weight in tags:
        print word,weight



#textrank sort keyword
if opt.textr:
    topk = int(opt.textr);
    tags = anal.textrank(txt, topk, withWeight=True);

    for word,weight in tags:
        print word,weight;
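
# example invocation, assuming the script is saved as jieba_demo.py (any name works);
# the --tag/--tfidf/--textr demos only run after removing the sys.exit() call above:
#   python jieba_demo.py --fast --stopdict dict/stopword.txt --tag --tfidf 10 --textr 10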
Original article: https://www.cnblogs.com/demonxian3/p/9173886.html