<Think Python>中统计文献单词的处理代码

def process_line(line, hist):
    """Adds the words in the line to the histogram.

    Hyphens are treated as word separators. Each whitespace-delimited
    token has leading and trailing punctuation/whitespace removed and is
    lowercased before being counted. Tokens that consist only of
    punctuation (e.g. "!" or "...") are skipped, so the empty string is
    never recorded as a word.

    Modifies hist; returns None.

    line: string
    hist: histogram (map from word to frequency)
    """
    # Characters trimmed from both ends of every token; built once per
    # call instead of once per word.
    strippables = string.punctuation + string.whitespace

    # replace hyphens with spaces before splitting, so a hyphenated
    # compound is counted as its separate parts
    for token in line.replace('-', ' ').split():
        # strip() only trims the two ends of the token, so punctuation in
        # the middle of a word (e.g. the apostrophe in "isn't") is kept
        word = token.strip(strippables).lower()

        # a token made only of punctuation strips down to '', which is
        # not a word and must not be counted
        if not word:
            continue

        # update the histogram
        hist[word] = hist.get(word, 0) + 1