# bayes

from numpy import *

import time
# Wall-clock timestamp captured when the module starts executing; nothing in
# the visible part of this file reads it back.
starttime = time.time()


def loadDataSet():
    """Return the built-in toy corpus together with its class labels.

    Returns:
        A ``(postingList, classVec)`` tuple: ``postingList`` is a list of six
        tokenised posts (each a list of word strings) and ``classVec`` holds
        one integer label per post, in the same order.
        NOTE(review): presumably label 1 marks the abusive posts - confirm.
    """
    # One post per line; splitting on whitespace yields the token lists.
    raw_posts = (
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    )
    postingList = [post.split() for post in raw_posts]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

def createVocabList(dataSet):
    """Collect every distinct word that occurs in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words
            (for example the ``postingList`` from ``loadDataSet``).

    Returns:
        A list holding each distinct word exactly once. The order is the
        iteration order of a set and must not be relied upon.
    """
    vocabulary = {word for document in dataSet for word in document}
    return list(vocabulary)

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a smoothed set-of-words vector.

    Args:
        vocabList: list of vocabulary words; fixes the column order.
        inputSet: iterable of hashable words making up the document.

    Returns:
        A list with one float per vocabulary word: ``2.0`` when the word
        occurs in ``inputSet`` and ``1.0`` otherwise (presence flag plus a
        constant 1.0 offset, so no entry is ever zero).

    Side effects:
        Prints one warning for each input word that the vocabulary does not
        contain, mirroring what ``testx2vec`` does for its input.
    """
    present = set(inputSet)
    known = set(vocabList)

    # Warn about input words the vocabulary cannot represent. The earlier
    # version warned about vocabulary words absent from the input instead,
    # which flagged nearly the whole vocabulary on every call.
    for word in inputSet:
        if word not in known:
            print("the word: %s is not in my Vocabulary!" % word)

    # Build the vector positionally rather than via vocabList.index(), which
    # was a linear scan per word.
    return [2.0 if word in present else 1.0 for word in vocabList]




def txt2trainxy(filename1, filename2, basedir='email'):
    """Load two folders of text files and vectorise them for training.

    Every file under ``basedir/filename1`` and ``basedir/filename2`` becomes
    one training document labelled with the name of its folder.

    Args:
        filename1: name of the first class folder (also used as its label).
        filename2: name of the second class folder (also used as its label).
        basedir: directory that contains both class folders. Defaults to
            ``'email'``, the location that used to be hard-coded.

    Returns:
        A tuple ``(trainxws, trainxwb, trainy, trainx, fulltext)``:
        ``trainxws`` - set-of-words vectors (2.0 if the word occurs, else 1.0);
        ``trainxwb`` - bag-of-words vectors (occurrence count + 1.0);
        ``trainy``   - one folder-name label per document;
        ``trainx``   - the token list of every document;
        ``fulltext`` - sorted list of distinct tokens (the vector columns).

    Raises:
        OSError: if a class folder or one of its files cannot be read.
    """
    import os
    import re
    from collections import Counter

    # Split on runs of non-word characters. The former pattern r'W*' matched
    # the literal letter "W", so lower-cased text was never split into words.
    tokenizer = re.compile(r'\W+')

    # step 1: loading data...
    print("step 1: loading data...")

    trainx = []
    trainy = []
    for label in (filename1, filename2):
        folder = os.path.join(basedir, label)
        # Sorted so that document order does not depend on the file system.
        for name in sorted(os.listdir(folder)):
            with open(os.path.join(folder, name)) as handle:
                # NOTE(review): only the first line of each file is used, as
                # before; readline() additionally tolerates an empty file.
                first_line = handle.readline()
            tokens = [tok for tok in tokenizer.split(first_line.lower())
                      if len(tok) > 2]
            trainx.append(tokens)
            # Appending the label directly keeps labels aligned with documents
            # even when a folder name contains whitespace.
            trainy.append(label)

    # Sorted vocabulary gives a reproducible column order.
    fulltext = sorted(set(tok for doc in trainx for tok in doc))

    # set of words
    trainxws = []
    # bag of words
    trainxwb = []
    for doc in trainx:
        counts = Counter(doc)
        trainxws.append([2.0 if word in counts else 1.0 for word in fulltext])
        trainxwb.append([counts[word] + 1.0 for word in fulltext])

    return trainxws, trainxwb, trainy, trainx, fulltext

def testx2vec(testx, fulltext):
    # set of words
    testxws = [list(set(testx)).count(strg) + 1.0 for strg in fulltext] #
    # bag of words 
    testxwb = [testx.count(strg) + 1.0 for strg in fulltext] #
    for word in testx:
        if word not in fulltext:
            print "the word: %s is not in my fulltext!" % word
    return testxws, testxwb

def bayes(testx, trainx, trainy, fulltext):
    """Classify one feature vector with a naive Bayes model fit on the fly.

    The model is re-estimated from ``trainx``/``trainy`` on every call.

    Args:
        testx: feature vector of the document to classify, aligned with
            ``fulltext`` (e.g. one of the vectors from ``testx2vec``).
        trainx: training feature vectors, one equal-length row per document.
        trainy: class label of every row of ``trainx``.
        fulltext: vocabulary list; only used to name the top-weighted words
            in the diagnostic output.

    Returns:
        The label from ``trainy`` whose class has the highest score
        ``log P(class) + sum(log P(word | class) * testx)``.

    Raises:
        ValueError: if ``trainx`` is empty.

    Side effects:
        Prints progress lines, the (up to) five highest-probability words and
        the label of every class, the per-class scores and the decision.
    """
    # Explicit numpy names: the computation no longer depends on the module's
    # `from numpy import *` replacing the builtin sum().
    import numpy as np

    if len(trainx) == 0:
        raise ValueError("bayes() needs at least one training vector")

    print("---Getting Prob...")
    classes = list(set(trainy))
    total = len(trainy)
    features = np.asarray(trainx, dtype=float)

    # Row indices of the training documents that belong to each class.
    rows_by_class = [
        [row for row, label in enumerate(trainy) if label == cls]
        for cls in classes
    ]

    # log prior of every class
    logproby = np.array(
        [np.log(len(rows) / float(total)) for rows in rows_by_class])

    # Per-class word probabilities: column sums divided by the class's total
    # mass plus 2.0 (the smoothing constant used by the original code).
    probx = np.array([
        features[rows].sum(axis=0) / (features[rows].sum() + 2.0)
        for rows in rows_by_class
    ])
    logprobx = np.log(probx)

    print("---Printing Prob...")
    # argsort() orders small to big, so negate to get the largest first.
    top_columns = (-probx).argsort()[:, :5]
    # Report every class; indexing classes 0 and 1 explicitly crashed when
    # the training data held a single class and skipped any class past two.
    for columns, rows in zip(top_columns, rows_by_class):
        print([fulltext[i] for i in columns])
        print(trainy[rows[0]])

    # step 4: showing the result...
    print("---Showing the result...")
    # set of words
    weights = np.asarray(testx, dtype=float)
    sumlogpxws = (logprobx * weights).sum(axis=1)
    sumlogpxyws = sumlogpxws + logproby
    print(sumlogpxws)
    print((probx * weights).sum(axis=1))
    bestyws = trainy[rows_by_class[sumlogpxyws.argmax()][0]]
    # Two spaces after the colon reproduce `print "...: ", value`.
    print("---From set of words:  %s" % (bestyws,))
    return bestyws
    

def main(test_size=20):
    """Run the demo: load the corpus, hold-out test it, classify a sample.

    Args:
        test_size: number of documents to hold out for testing. Defaults to
            20, the value that used to be hard-coded. It is capped so that
            at least one document is always left for training.

    Side effects:
        Reads the ``spam`` and ``ham`` folders through ``txt2trainxy`` and
        prints progress, the vocabulary, the error count, the hold-out size
        and the error rate, then classifies a fixed three-word sample.
    """
    import random

    # step 1: loading data...
    trainxws, trainxwb, trainy, trainx, fulltext = txt2trainxy('spam', 'ham')
    print(fulltext)

    # step 2: training...
    # Nothing to do here: bayes() estimates the model on every call.
    print("step 2: training...")

    # step 3: testing...
    print("step 3: testing...")
    print("---Preparing testdata...")
    l = len(trainy)
    # random.sample() raises ValueError when asked for more items than exist,
    # and holding out everything would leave nothing to train on.
    holdout = max(0, min(test_size, l - 1))
    testid = random.sample(range(l), holdout)
    held_out = set(testid)  # constant-time membership for the split below
    testxxx = [trainxws[i] for i in testid]
    testyyy = [trainy[i] for i in testid]
    testtrainxws = [trainxws[i] for i in range(l) if i not in held_out]
    testtrainy = [trainy[i] for i in range(l) if i not in held_out]

    print("---Testing now...")
    errorcount = 0
    p = len(testid)
    for vector, expected in zip(testxxx, testyyy):
        if bayes(vector, testtrainxws, testtrainy, fulltext) != expected:
            errorcount += 1
    print(errorcount)
    print(p)
    if p:
        # Two spaces after the colon reproduce `print "...: ", value`.
        print("---Errorrate is:  %s" % (errorcount / float(p)))
    else:
        print("---Errorrate is:  n/a (no held-out documents)")

    # step 4: showing the result
    print("step 4: using...")
    testx = ['love', 'my', 'dalmation']
    print("the testx is:  %s" % (testx,))
    print("---Changing testx into vector...")
    testxws, testxwb = testx2vec(testx, fulltext)
    bayes(testxws, testtrainxws, testtrainy, fulltext)

# Run the whole pipeline as soon as this module is executed or imported;
# there is no `if __name__ == "__main__"` guard around the call.
main()


# Disabled toy-corpus demo, parked in a bare string literal so it never runs.
# NOTE(review): it hands bayes() a raw word list (testEntry1) rather than a
# numeric vector, so it would need setOfWords2Vec()/testx2vec() applied to the
# test entry before it could be revived.
"""
trainx, trainy = loadDataSet()
fulltext = createVocabList(trainx)
print fulltext
print setOfWords2Vec(fulltext, trainx[0])
trainxws = []
for t in trainx:
    trainxws.append(setOfWords2Vec(fulltext, t))
testEntry1 = ['love', 'my', 'dalmation']
testEntry2 = ['stupid', 'garbage']
bayes(testEntry1, trainxws, trainy, fulltext)

"""