nltk_28Twitter情感分析模型

sklearn实战-乳腺癌细胞数据挖掘(博客主亲自录制视频教程)

https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share

生产Twitter情感分析的模型，并保存数据为pickle，此过程可能要一个小时，所以下次调用数据就很简单了

# -*- coding: utf-8 -*-
"""
Created on Thu Jan 12 10:44:19 2017

@author: Administrator

用于短评论分析-- Twitter

保存后的"positive.txt"，"negative.txt"需要转码为utf-8
在线转码网址
http://www.esk365.com/tools/GB2312-UTF8.asp


features=5000,准确率百分之60以上
features=10000，准确率百分之 以上

运行时间可能长达一个小时
"""

import nltk
import random
import pickle
from nltk.tokenize import word_tokenize
        
short_pos = open("positive.txt","r").read()
short_neg = open("negative.txt","r").read()

# move this up here
documents = []
all_words = []

for r in short_pos.split('
'):
    documents.append( (r, "pos") )

for r in short_neg.split('
'):
    documents.append( (r, "neg") )


#  j is adject, r is adverb, and v is verb
#allowed_word_types = ["J","R","V"] 允许形容词类别
allowed_word_types = ["J"]


for p in short_pos.split('
'):
    documents.append( (p, "pos") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())


for p in short_neg.split('
'):
    documents.append( (p, "neg") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

#保存文档            
save_documents = open("pickled_algos/documents.pickle","wb")
pickle.dump(documents, save_documents)
save_documents.close()           


#保存特征
all_words = nltk.FreqDist(all_words)
#最好改成2万以上
word_features = list(all_words.keys())[:5000]
save_word_features = open("pickled_algos/word_features5k.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()


def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

random.shuffle(featuresets)
print(len(featuresets))

testing_set = featuresets[10000:]
training_set = featuresets[:10000]


classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

#保存分类器
save_classifier = open("pickled_algos/originalnaivebayes5k.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

 sentiment_mod.py

# -*- coding: utf-8 -*-
"""
Created on Thu Jan 12 16:47:51 2017

@author: Administrator
"""

#File: sentiment_mod.py

import nltk
import random
import pickle
from nltk.tokenize import word_tokenize

documents_f = open("pickled_algos/documents.pickle", "rb")
documents = pickle.load(documents_f)
documents_f.close()


word_features5k_f = open("pickled_algos/word_features5k.pickle", "rb")
word_features = pickle.load(word_features5k_f)
word_features5k_f.close()


def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features


featuresets_f = open("pickled_algos/featuresets.pickle", "rb")
featuresets = pickle.load(featuresets_f)
featuresets_f.close()

random.shuffle(featuresets)
print(len(featuresets))

testing_set = featuresets[10000:]
training_set = featuresets[:10000]


open_file = open("pickled_algos/originalnaivebayes5k.pickle", "rb")
classifier = pickle.load(open_file)
open_file.close()



def sentiment(text):
    feats = find_features(text)
    return classifier.classify(feats)

测试

# -*- coding: utf-8 -*-
"""
Created on Thu Jan 12 16:50:12 2017

@author: Administrator
"""

import sentiment_mod as s

print(s.sentiment("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"))
print(s.sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"))

python风控评分卡建模和风控常识(博客主亲自录制视频教程)

https://study.163.com/course/introduction.htm?courseId=1005214003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share