Spell Correction System

Spell Correction

# Load the dictionary: one word per line
vocab = set([line.rstrip() for line in open('/content/drive/My Drive/data/vocab_data/vocab.txt')])

We need to generate the set of all candidate words for a misspelled input.

def generate_candidates(word):
  """
  word: the given (misspelled) input
  Returns all valid candidates found in the vocabulary.
  """
  # Generate words at edit distance 1 via three operations:
  # 1. insert 2. delete 3. replace
  # For "appl":
  #   replace: bppl, cppl, ..., abpl, acpl, ...
  #   insert:  aappl, bappl, ..., abppl, acppl, ...
  #   delete:  ppl, apl, app, ...
  # Assume the 26 lowercase letters
  letters = 'abcdefghijklmnopqrstuvwxyz'

  # All ways to split the word into a left part L and a right part R
  splits = [(word[:i], word[i:]) for i in range(len(word)+1)]

  # insert operation
  inserts = [L+c+R for L, R in splits for c in letters]
  # delete operation
  deletes = [L+R[1:] for L, R in splits if R]
  # replace operation
  replaces = [L+c+R[1:] for L, R in splits if R for c in letters]

  # All generated candidate words
  candidates = set(inserts + deletes + replaces)

  # Filter out words that do not exist in the vocabulary
  return [word for word in candidates if word in vocab]

generate_candidates("apple")

Load the corpus

import nltk
nltk.download('reuters')
nltk.download('punkt')
from nltk.corpus import reuters

# Use all categories of the Reuters corpus as training data for the language model
categories = reuters.categories()
corpus = reuters.sents(categories=categories)
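A quick sanity check of the corpus shape (my own inspection code; the NLTK Reuters corpus has roughly 54,000 sentences, each already tokenized):

print(len(corpus))     # number of sentences
print(corpus[0][:10])  # first tokens of the first sentence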

Build the language model: bigram

term_count = {}     # unigram counts
bigram_count = {}   # bigram counts, keyed by "w1 w2"
for doc in corpus:
  # Prepend a start-of-sentence symbol
  doc = ['<s>'] + doc
  # bigram: tokens [i, i+1]
  for i in range(0, len(doc)-1):
    term = doc[i]
    bigram = doc[i:i+2]

    if term in term_count:
      term_count[term] += 1
    else:
      term_count[term] = 1

    # Join with a space so that e.g. "a b" and "ab" cannot collide
    bigram = ' '.join(bigram)
    if bigram in bigram_count:
      bigram_count[bigram] += 1
    else:
      bigram_count[bigram] = 1
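With these counts we can score any bigram using add-one (Laplace) smoothing, which is exactly what the correction loop below does inline. A small helper wrapping that computation (my own addition, not part of the original post):

import numpy as np

def bigram_log_prob(prev, word, V):
  """log P(word | prev) with add-one (Laplace) smoothing."""
  bigram = ' '.join([prev, word])
  if prev in term_count and bigram in bigram_count:
    return np.log((bigram_count[bigram] + 1.0) / (term_count[prev] + V))
  # Back off to a uniform 1/V probability for unseen bigrams
  return np.log(1.0 / V)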

The probability of the user's typo (channel probability)

channel_prob = {}

# Each line of spell-errors.txt looks like "correct: mistake1, mistake2, ..."
for line in open('/content/drive/My Drive/data/vocab_data/spell-errors.txt'):
  items = line.split(":")
  correct = items[0].strip()
  mistakes = [item.strip() for item in items[1].strip().split(",")]
  # Split the probability mass uniformly over the observed mistakes
  channel_prob[correct] = {}
  for mis in mistakes:
    channel_prob[correct][mis] = 1.0/len(mistakes)
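For example, a line such as "raining: rainning, raning" (the format of Norvig's spell-errors data, which this file appears to follow) yields channel_prob['raining'] == {'rainning': 0.5, 'raning': 0.5}: each observed misspelling gets an equal share of the probability.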
import numpy as np

V = len(term_count.keys())  # vocabulary size, used for smoothing

file = open('/content/drive/My Drive/data/vocab_data/testdata.txt')
for line in file:
  items = line.rstrip().split('\t')
  tokens = items[2].split()
  # e.g. ["I", "loke", "palying"]
  for word in tokens:
    if word not in vocab:
      # word needs to be replaced with the correct word
      # Step 1: generate all valid candidates
      candidates = generate_candidates(word=word)

      if len(candidates) < 1:
        # TODO: generate a larger candidate set (e.g. edit distance 2)
        continue  # skipping is not recommended (it is not really correct)

      probs = []
      # For each candidate, compute its score:
      # score = p(correct) * p(mistake|correct)
      #       = log p(correct) + log p(mistake|correct)
      # Return the candidate with the highest score
      for candi in candidates:
        prob = 0.0
        # a. compute the channel probability
        if candi in channel_prob and word in channel_prob[candi]:
          prob += np.log(channel_prob[candi][word])
        else:
          prob += np.log(0.0001)  # small back-off probability

        # b. compute the language-model probability;
        #    sentences start with the <s> symbol
        idx = tokens.index(word)
        prev = tokens[idx-1] if idx > 0 else '<s>'
        bigram = ' '.join([prev, candi])
        if prev in term_count and bigram in bigram_count:
          # add-one (Laplace) smoothing
          prob += np.log((bigram_count[bigram] + 1.0) / (term_count[prev] + V))
        else:
          prob += np.log(1.0/V)

        probs.append(prob)

      max_idx = probs.index(max(probs))
      print(word, candidates[max_idx])
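The loop above prints a correction for every out-of-vocabulary word. For reuse, the same scoring can be packaged into a function (a sketch under the same assumptions, using the bigram_log_prob helper sketched earlier; not part of the original post):

def correct_word(word, prev):
  """Return the most likely correction of word given the previous token."""
  candidates = generate_candidates(word)
  if not candidates:
    return word  # fall back to the input unchanged
  scores = []
  for candi in candidates:
    # channel probability with a small back-off, plus the bigram LM score
    score = np.log(channel_prob.get(candi, {}).get(word, 0.0001))
    score += bigram_log_prob(prev, candi, V)
    scores.append(score)
  return candidates[scores.index(max(scores))]

correct_word("loke", "I")  # might return "like", depending on vocab and counts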

To be continued...

Original post: https://www.cnblogs.com/TuringEmmy/p/12534966.html