pymmseg

这个库用起来还是不太顺手:分词结果有点问题,还会出现乱码。
#! /usr/bin/env python
#coding=utf-8
#import sys
#print sys.path
from pymmseg import mmseg
mmseg.dict_load_defaults()
import chardet
import redis
r=redis.Redis(host='10.3.11.178',port=6379,db=1)
#r['foo']='bar'
#print r.get('foo')
#print r.type("foo")
#r.rpush("aa","cc")
#print r.type("aa")
#print r.lindex('aa',0)
#print r.rpush("aa","dd")
#print r.lindex('aa',1)
#print "-------------"
#print len(r.lrange('aa',0,-1))
#print r.lrange('aa',0,-1)
print r.lrange('00000001',0,0)
a=r.lrange('00000001',0,0)[0]
print a
algor = mmseg.Algorithm(a)
for tok in algor:
    print "--"
    print chardet.detect(tok.text)
    print '%s [%d..%d]' % (tok.text, tok.start, tok.end)

结果如下:
> "C:\Python25\pythonw.exe"  "F:\dm_app\tag2\test\redis_test.py"
['\xc0\xf1\xba\xd0\xb0\xfc\xd7\xb0']
礼盒包装
--
{'confidence': 0.0, 'encoding': None}
礼 [0..2]
--
{'confidence': 0.505, 'encoding': 'utf-8'}
邪 [3..5]
--
{'confidence': 0.98999999999999999, 'encoding': 'GB2312'}
装 [6..8]

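你看,结果里多出了个“邪”字,“盒”和“包”反而不见了,what the hell?
原因很可能是编码不一致:Redis 里取出来的是 GBK 编码的字节串('\xc0\xf1\xba\xd0\xb0\xfc\xd7\xb0'),而 mmseg 是按 UTF-8 来处理输入的,所以应该先用 a.decode('gbk').encode('utf-8') 转码之后再分词。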



原文地址:https://www.cnblogs.com/lexus/p/1695460.html