【NLP CODE】基于词典的中文分词算法1：最大匹配法

REF：https://zhuanlan.zhihu.com/p/103392455

# 正向最大匹配法（FMM，Forward Maximum Matching）

# 反向最大匹配法（BMM, Backward Maximum Matching）

# 双向最大匹配法首先看两种方法结果的分词数，分词数越少越好；分词数相同的情况下，看单个词的数量，越少越好

FMM.py

def FMM_func(user_dict, sentence):
    """
    正向最大匹配（FMM）
    :param user_dict: 词典
    :param sentence: 句子
    """
    # 词典中最长词长度
    max_len = max([len(item) for item in user_dict])
    start = 0
    while start != len(sentence):
        index = start + max_len
        if index>len(sentence):
            index = len(sentence)
        for i in range(max_len):
            if (sentence[start:index] in user_dict) or (len(sentence[start:index])==1):
                print(sentence[start:index], end='/')
                start = index
                break
            index += -1

BMM.py

def BMM_func(user_dict, sentence):
    """
    反向最大匹配（BMM）
    :param user_dict:词典
    :param sentence:句子
    """
    
    max_len = max([len(item) for item in user_dict]) # 词典中最长词长度
    result = []
    start = len(sentence)
    #print("start:"+str(start))
    #print("max_len:"+str(max_len))
    while start != 0:
        index = start - max_len
        #print("start:"+str(start))
        #print("max_len:"+str(max_len))
        #print("index:"+str(index))
        if index < 0: # 如果最大词出了界线，从边界开始算
            index = 0
            #break
        for i in range(max_len):
            if (sentence[index:start] in user_dict) or (len(sentence[start:index])==1):
                result.append(sentence[index:start])
                start = index
                break
            index += 1

    for i in result[::-1]:
        print(i, end='/')

TestMaxWords1.py

# https://zhuanlan.zhihu.com/p/103392455
# 正向最大匹配法（FMM，Forward Maximum Matching）
# 反向最大匹配法（BMM, Backward Maximum Matching）
# 双向最大匹配法 首先看两种方法结果的分词数，分词数越少越好；分词数相同的情况下，看单个词的数量，越少越好

from FMM import FMM_func
from BMM import BMM_func

sentence = '我们在野生动物园玩'
user_dict = ['我们', '在', '在野', '生动', '野生', '动物园', '野生动物园', '园','玩']

FMM_func(user_dict, sentence)
print()
BMM_func(user_dict, sentence)

# 如：“我们在野生动物园玩”
# 正向最大匹配法，最终切分结果为：“我们/在野/生动/物/园/玩”，其中，两字词3个，单字字典词为2，非词典词为1（'物'不在词典）。
# 逆向最大匹配法，最终切分结果为：“我们/在/野生动物园/玩”，其中，五字词1个，两字词1个，单字字典词为2，非词典词为0。
# 非字典词：正向(1)>逆向(0)（越少越好）
# 单字字典词：正向(2)=逆向(2)（越少越好）
# 总词数：正向(6)>逆向(4)（越少越好）
# 因此最终输出为：逆向结果。逆向结果是正确的。

TestMaxWords2.py

# https://www.cnblogs.com/dahuang123/p/11990651.html


from FMM import FMM_func
from BMM import BMM_func

sentence = '他是研究生物化学的'
user_dict = ['他','是','研究','研究生','生物','物化','化学','生物化学','学','的']

FMM_func(user_dict, sentence)
print()
BMM_func(user_dict, sentence)

# 他/是/研究生/物化/学/的/
# 他/是/研究/生物化学/的/
# 非字典词：正向(0)=逆向(0)（越少越好）
# 单字字典词：正向(4)>逆向(3)（越少越好）
# 总词数：正向(6)>逆向(5)（越少越好）
# 因此最终输出为：逆向结果。逆向结果是正确的。

TestMaxWords3.py

from FMM import FMM_func
from BMM import BMM_func

sentence = '商品和服务'
user_dict = ['商品','和','和服','服务']

FMM_func(user_dict, sentence)
print()
BMM_func(user_dict, sentence)

# 商品/和服/务/
# 商品/和/服务/
# 非字典词：正向(0)=逆向(0)（越少越好）
# 单字字典词：正向(1)=逆向(1)（越少越好）
# 总词数：正向(3)>逆向(3)（越少越好）
# 因此最终输出为：不确定。逆向结果是正确的。

TestMaxWords4.py

from FMM import FMM_func
from BMM import BMM_func

sentence = '南京市长江大桥'
user_dict = ['南京','南京市','市','市长','长江','大桥','江大桥']

FMM_func(user_dict, sentence)
print()
BMM_func(user_dict, sentence)

# 南京市/长江/大桥/
# 南京/市长/江大桥/
# 非字典词：正向(0)=逆向(0)（越少越好）
# 单字字典词：正向(0)=逆向(0)（越少越好）
# 总词数：正向(3)=逆向(3)（越少越好）
# 因此最终输出为：不确定。正向结果是正确的。

TestMaxWords5.py

from FMM import FMM_func
from BMM import BMM_func

sentence = '下雨天留客天留我不留'
user_dict = ['下','雨','下雨','雨天','天','留客','我','不','不留','留']

FMM_func(user_dict, sentence)
print()
BMM_func(user_dict, sentence)

输出：

下雨/天/留客/天/留/我/不留/
下/雨天/留客/天/留/我/不留/