def textParse(bigString):
    """Split a string into lowercase word tokens longer than two characters.

    Args:
        bigString: The raw text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance, keeping only
        tokens whose length is greater than 2.
    """
    import re
    # Split on runs of one or more non-word characters. The pattern must be
    # \W+ rather than \W*: a pattern that can match the empty string makes
    # re.split (Python 3.7+) cut between every character, so every token
    # would be a single character and be discarded by the length filter.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Train and evaluate the naive Bayes classifier on the e-mail files.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (labelled 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (labelled 0), holds out 10
    randomly chosen documents as a test set, trains on the remaining 40 and
    prints the error rate measured on the held-out documents.

    Returns:
        None. The error rate is printed, not returned.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Context managers make sure each file handle is closed promptly.
        with open('email/spam/%d.txt' % i) as spamFile:
            docList.append(textParse(spamFile.read()))
        classList.append(1)
        with open('email/ham/%d.txt' % i) as hamFile:
            docList.append(textParse(hamFile.read()))
        classList.append(0)
    vocabList = createVocabList(docList)

    # A real list is required here: range objects do not support item
    # deletion on Python 3.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        # Move one randomly chosen document index from training to test.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [setofWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = setofWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))
# Run the e-mail spam experiment at module level; this executes as soon as
# the file is run or imported and prints the resulting error rate.
spamTest()
def calcMostFreq(vocabList, fullText):
    """Return the most frequent vocabulary tokens together with their counts.

    Args:
        vocabList: Iterable of tokens whose frequencies are wanted.
        fullText: List of all tokens (with repetitions) to count in.

    Returns:
        A list of at most 30 ``(token, count)`` tuples, sorted by count in
        descending order. Tokens that never occur in ``fullText`` get a
        count of 0.
    """
    import operator
    from collections import Counter
    # Count every token once up front instead of calling fullText.count()
    # for each vocabulary entry (which rescans the whole text every time).
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    # dict.items() replaces the Python 2-only dict.iteritems().
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]
def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier on two parsed feeds.

    Entry summaries from ``feed1`` are labelled 1 and those from ``feed0``
    are labelled 0. The same number of entries is taken from each feed. The
    most frequent words reported by ``calcMostFreq`` are removed from the
    vocabulary, 20 randomly chosen documents are held out for testing, the
    classifier is trained on the rest, and the test error rate is printed.

    Args:
        feed1: Mapping with an ``'entries'`` sequence whose items have a
            ``'summary'`` string; used as class 1.
        feed0: Same structure as ``feed1``; used as class 0.

    Returns:
        A tuple ``(vocabList, p0V, p1V)``: the pruned vocabulary and the two
        per-class vectors returned by ``trainNB0``.

    Raises:
        IndexError: If the feeds provide fewer than 20 documents in total,
            because the test-set sampling runs out of indices.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        # Class-1 words must be counted too, otherwise the frequency ranking
        # below is computed from feed0 alone.
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = createVocabList(docList)
    # Drop the highest-frequency words from the vocabulary.
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    # A real list is required here: range objects do not support item
    # deletion on Python 3.
    trainingSet = list(range(2 * minLen))
    testSet = []
    for _ in range(20):
        # Move one randomly chosen document index from training to test.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        # Compare against this document's own label, not the whole label list.
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V
import feedparser
# Parse the two Craigslist RSS feeds (newyork and sfbay sites); the results
# are passed below as the two document classes.
ny=feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf=feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
def getTopWords(ny,sf):
    """Print the highest-scoring words for each of the two feeds.

    Trains a classifier through ``localWords(ny, sf)``, so ``ny`` is class 1
    (scored by ``p1V``) and ``sf`` is class 0 (scored by ``p0V``). Every word
    whose per-class score exceeds -6.0 is collected, and each list is printed
    in descending score order under its own banner.

    Args:
        ny: Parsed feed passed to ``localWords`` as class 1.
        sf: Parsed feed passed to ``localWords`` as class 0.

    Returns:
        None. The words are printed, not returned.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []
    topSF = []
    # NOTE(review): the scores are presumably log-probabilities from trainNB0,
    # which would make -6.0 a log-scale cutoff; confirm against trainNB0.
    for word, scoreSF, scoreNY in zip(vocabList, p0V, p1V):
        # Test the score of this word, not the whole score vector.
        if scoreSF > -6.0:
            topSF.append((word, scoreSF))
        if scoreNY > -6.0:
            topNY.append((word, scoreNY))

    # Sort on the score, which is element 1 of each (word, score) pair.
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print('SF**SF**SF**SF**SF**')
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print('NY**NY**NY**NY**NY**')
    for item in sortedNY:
        print(item[0])
# Print the top words per feed; getTopWords itself calls localWords(ny, sf).
getTopWords(ny, sf)
# Run localWords once more on its own; it draws a new random test split and
# prints its error rate again. The returned tuple is discarded here.
localWords(ny, sf)