hanlp入门 (含标准分词、NLP分词、索引分词、N-最短路径分词、CRF分词、极速词典分词、自定义分词)

直接给代码

 # -*- coding: utf-8 -*-
"""Print the output of several HanLP tokenizers called through pyhanlp.

Each section prints a heading, the tokenizer's result, and a separator
line. The Java classes are looked up by fully qualified name via JClass.
"""
from pyhanlp import HanLP, JClass

_RULE = "-" * 70
_GREETING = '你好,欢迎在Python中调用HanLP的API'


def _banner(title):
    """Print a section heading: *title* with 30 '=' characters on each side."""
    print("=" * 30 + title + "=" * 30)


# Chinese word segmentation through the HanLP facade.
print(HanLP.segment(_GREETING))
print(_RULE)

_banner("标准分词")
StandardTokenizer = JClass('com.hankcs.hanlp.tokenizer.StandardTokenizer')
print(StandardTokenizer.segment(_GREETING))
print(_RULE)

# Per the original author's note: NLPTokenizer performs full named-entity
# recognition and part-of-speech tagging.
_banner("NLP分词")
NLPTokenizer = JClass('com.hankcs.hanlp.tokenizer.NLPTokenizer')
print(NLPTokenizer.segment('中国科学院计算技术研究所的宗成庆教授正在教授自然语言处理课程'))
print(_RULE)

_banner("索引分词")
IndexTokenizer = JClass('com.hankcs.hanlp.tokenizer.IndexTokenizer')
for term in IndexTokenizer.segment("主副食品"):
    # Show each term followed by its [start:end] span, where end is the
    # term's offset plus the length of its word.
    start = term.offset
    end = start + len(term.word)
    print(str(term) + " [" + str(start) + ":" + str(end) + "]")
print(_RULE)

_banner(" N-最短路径分词")
# NOTE(review): this section prints only its heading. The disabled snippet
# below refers to CRFSegment rather than an N-shortest-path segmenter, so it
# appears to belong to the CRF section; it is kept as found.
# CRFSegment = JClass('com.hankcs.hanlp.seg.CRF.CRFSegment')
# segment=CRFSegment()
# testCase ="今天,刘志军案的关键人物,山西女商人丁书苗在市二中院出庭受审。"
# print(segment.seg("你看过穆赫兰道吗"))
print(_RULE)

_banner(" CRF分词")
# NOTE(review): no CRF example is implemented here; only the heading prints.
print(_RULE)

_banner(" 极速词典分词")
SpeedTokenizer = JClass('com.hankcs.hanlp.tokenizer.SpeedTokenizer')
# Fix: the original called NLPTokenizer here, leaving SpeedTokenizer unused,
# so this section never demonstrated the speed tokenizer.
print(SpeedTokenizer.segment('江西鄱阳湖干枯,中国最大淡水湖变成大草原'))
print(_RULE)

_banner(" 自定义分词")
CustomDictionary = JClass('com.hankcs.hanlp.dictionary.CustomDictionary')
# Register the custom words before segmenting the sentence that contains them.
CustomDictionary.add('攻城狮')
CustomDictionary.add('单身狗')
# Rebinds HanLP to the Java class looked up by name, as the original did.
HanLP = JClass('com.hankcs.hanlp.HanLP')
print(HanLP.segment('攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰'))
print(_RULE)
原文地址:https://www.cnblogs.com/smartisn/p/13822711.html