代碼人生的記憶---2018-07-12

def textParse(bigString):
    """Split raw text into lowercase tokens longer than two characters.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lowercased tokens. Tokens of length two or less
        (including the empty strings produced at the edges of the split)
        are dropped.
    """
    import re

    # Split on runs of ONE OR MORE non-word characters.  The pattern r'\W*'
    # can match the empty string, and since Python 3.7 re.split() also
    # splits on empty matches, which chops the text into single characters
    # that the length filter below then throws away.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    """Train and evaluate the naive Bayes spam classifier on a random hold-out.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (class 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (class 0), holds out 10
    randomly chosen documents for testing, trains on the remaining 40 and
    prints the error rate on the held-out documents.

    Relies on names that are not defined in this block and must be in
    scope at module level: ``random``, ``array``, ``createVocabList``,
    ``setofWords2Vec``, ``trainNB0`` and ``classifyNB``.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Keep the original ordering: spam document i, then ham document i.
        for folder, label in (('spam', 1), ('ham', 0)):
            # Context manager so the file handle is closed promptly.
            with open('email/%s/%d.txt' % (folder, i)) as emailFile:
                wordList = textParse(emailFile.read())
            docList.append(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)

    # Must be a real list: range objects do not support item deletion on
    # Python 3, so removing the sampled indices would raise TypeError.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        # Move the chosen document index from the training to the test set.
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setofWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = setofWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))

# Run the spam hold-out experiment as soon as this module is executed.
spamTest()

def calcMostFreq(vocabList, fullText):
    """Return the 30 vocabulary tokens that occur most often in fullText.

    Args:
        vocabList: Iterable of tokens to rank.
        fullText: List of all tokens (with repeats) to count occurrences in.

    Returns:
        A list of at most 30 ``(token, count)`` tuples sorted by count in
        descending order. Tokens with equal counts keep their order of
        first appearance in ``vocabList``.
    """
    import operator
    from collections import Counter

    # Count every token once instead of calling fullText.count() per
    # vocabulary entry, which rescans the whole text each time.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    # dict.iteritems() only exists on Python 2; items() works on both.
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier that tells two feeds apart.

    Each feed is indexed as ``feed['entries'][i]['summary']``. The same
    number of entries (the smaller of the two feed lengths) is taken from
    each feed; ``feed1`` entries are labelled 1 and ``feed0`` entries 0.
    The 30 most frequent tokens are removed from the vocabulary, 20 randomly
    chosen documents are held out for testing, and the error rate on them
    is printed.

    Relies on names that are not defined in this block and must be in
    scope at module level: ``random``, ``array``, ``createVocabList``,
    ``bagOfWords2VecMN``, ``trainNB0`` and ``classifyNB``.

    NOTE(review): the hold-out size is fixed at 20, so the two feeds
    together need to supply more than 20 documents -- confirm this holds
    for the feeds in use.

    Args:
        feed1: Parsed feed whose entries form class 1.
        feed0: Parsed feed whose entries form class 0.

    Returns:
        A tuple ``(vocabList, p0V, p1V)``: the pruned vocabulary and the
        first two values returned by ``trainNB0``.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        # Count class-1 tokens too; otherwise the "most frequent words"
        # below would be computed from feed0 alone.
        fullText.extend(wordList)
        classList.append(1)

        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = createVocabList(docList)

    # Drop the 30 most frequent tokens from the vocabulary.
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    # Must be a real list: range objects do not support item deletion on
    # Python 3, so removing the sampled indices would raise TypeError.
    trainingSet = list(range(2 * minLen))
    testSet = []
    for _ in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        # Move the chosen document index from the training to the test set.
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        # Compare with this document's label, not with the whole label
        # list (which never equals a single prediction).
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V

import feedparser

# Parse the New York Craigslist "stp" RSS feed (presumably needs network
# access -- confirm against the feedparser documentation).
ny=feedparser.parse('http://newyork.craigslist.org/stp/index.rss')

# Parse the matching SF Bay Craigslist feed.
sf=feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')

def getTopWords(ny, sf):
    """Print the words scoring above -6.0 for each of the two feeds.

    Trains via ``localWords(ny, sf)`` (``ny`` is class 1, ``sf`` is
    class 0), keeps every vocabulary word whose per-class score is greater
    than -6.0, and prints the words for each class from highest to lowest
    score. The scores are presumably log-probabilities produced by
    ``trainNB0`` -- confirm against its definition.

    Args:
        ny: Parsed feed passed to ``localWords`` as class 1.
        sf: Parsed feed passed to ``localWords`` as class 0.
    """
    vocabList, p0V, p1V = localWords(ny, sf)

    topNY = []
    topSF = []
    for word, p0, p1 in zip(vocabList, p0V, p1V):
        # Test each word's own score; comparing the whole vector against
        # the threshold does not select individual words.
        if p0 > -6.0:
            topSF.append((word, p0))
        if p1 > -6.0:
            topNY.append((word, p1))

    # Sort by the score stored at index 1 of every (word, score) pair.
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print('SF**SF**SF**SF**SF**')
    for item in sortedSF:
        print(item[0])

    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print('NY**NY**NY**NY**NY**')
    for item in sortedNY:
        print(item[0])

# Train on the two feeds and print the top-scoring words for each.
getTopWords(ny, sf)

# Train and evaluate once more; getTopWords already calls localWords
# internally, so this repeats the experiment with a fresh random split.
localWords(ny, sf)

©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容