(python2.7)實現糗百爬蟲

先貼代碼,文章以后有空再寫,注釋也以后再寫,bug也以后再調,參考文獻也以后再貼吧,就這樣了(葛優躺)

文件1:main.py

# -*- coding:utf-8 -*-
import urllib
import qsbk

spider = qsbk.QsbkSpider()
spider.section='8hr'
spider.loadSomePages(10)
while True:
    article = spider.getRandomArticle()
    if not article:
        break
    print '[ page',article['pageIndex'],'artical',article['articleIndex'],']\n',\
        '< Article by', article['author'], '>\n', article['text'],'\n< God Comment >\n',\
        article['cmtMan'], article['cmt']
    print 'pause enter to get next article'
    input = raw_input()
    if(input in ['q','Q']):
        break

文件2:qsbk.py

__author__ = 'ssins'
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import random
from bs4 import BeautifulSoup

class QsbkSpider(object):
    """Scraper for text-only posts on qiushibaike.com (Python 2, urllib2 + bs4).

    Pages are fetched one at a time with loadNextPage()/loadSomePages() and
    cached in ``self._stories`` as a list of pages, each page being a list of
    article dicts with the keys 'author', 'text', 'cmtMan' and 'cmt'.
    getRandomArticle() then serves a random cached article.

    Public attributes:
        section      -- section to crawl; must be one of ``self._sections``.
        maxPageIndex -- highest page number that will be requested.
        user_agent   -- value that was used to build the User-Agent header.
        timeout      -- seconds to wait for the HTTP server before giving up.
        enable       -- initialised to False; not read anywhere in this class.
    """

    def __init__(self):
        self._pageIndex = 1
        self.maxPageIndex = 35
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self._qsbkUrl = 'http://www.qiushibaike.com/'
        self.section = '8hr'
        self._sections = ['8hr', 'hot', 'imgrank', 'text', 'history', 'pic', 'textnew']
        # Without a timeout urlopen() can block forever on a stalled server.
        self.timeout = 10

        self._headers = {'User-Agent': self.user_agent}
        self._stories = []
        self.enable = False

    def getPageUrl(self, section, pageIndex):
        """Return the URL of page `pageIndex` of `section`.

        Returns None when `section` is not a known section or `pageIndex`
        lies outside 1..self.maxPageIndex.
        """
        if section not in self._sections:
            return None
        if pageIndex < 1 or pageIndex > self.maxPageIndex:
            return None
        return self._qsbkUrl + section + '/page/' + str(pageIndex)

    def getPageInfo(self, url):
        """Fetch `url` and return the raw response body, or None on failure.

        A falsy `url` (e.g. the None that getPageUrl() returns for invalid
        input) yields None without touching the network.
        """
        if not url:
            return None
        try:
            request = urllib2.Request(url, headers=self._headers)
            response = urllib2.urlopen(request, timeout=self.timeout)
            try:
                return response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except Exception:
            # Deliberately best-effort: any network/HTTP error means "no
            # page".  Unlike a bare `except:`, this lets KeyboardInterrupt
            # and SystemExit propagate.
            return None

    def find_article_span(self, tag):
        """bs4 filter: True for a <span> that has no <img> or <h2> child."""
        if tag.name != 'span':
            return False
        # getattr() keeps this safe for children that expose no `name`.
        return not any(getattr(child, 'name', None) in ('img', 'h2')
                       for child in tag.children)

    def _parseArticle(self, articleTag):
        """Turn one article <div> into a dict, or None if it must be skipped.

        Articles are skipped when they contain a thumbnail
        (``class="thumb"``) or when the author <h2> / text <span> is missing.
        """
        markup = str(articleTag)
        if 'class="thumb"' in markup:
            return None
        # Replace <br/> by newlines and re-parse.  Per the bs4 docs, `.string`
        # is None for a tag with more than one child, which is what a <br/>
        # between two text runs would produce.
        markup = markup.replace('<br/>', '\n')
        soupArticle = BeautifulSoup(markup, 'lxml')

        authorTag = soupArticle.h2
        textTag = soupArticle.find(self.find_article_span)
        if authorTag is None or textTag is None:
            return None

        # The "god comment" is optional; fall back to placeholders.
        cmtMan = 'no God Comment'
        cmt = ''
        cmtManTag = soupArticle.find('span', class_='cmt-name')
        if cmtManTag is not None:
            cmtMan = cmtManTag.string
            cmtTag = soupArticle.find('div', class_='main-text')
            if cmtTag is not None:
                cmt = cmtTag.string

        return {
            'author': authorTag.string,
            'text': textTag.string,
            'cmtMan': cmtMan,
            'cmt': cmt,
        }

    def getPageArticles(self, section, pageIndex):
        """Download one page and append its text articles to self._stories.

        Returns:
            True  -- the page was parsed and appended (possibly as an empty
                     list when it held no text-only article).
            None  -- the URL was invalid or the page could not be fetched.
            False -- parsing the articles raised an unexpected error; nothing
                     is appended in that case.
        """
        pageCode = self.getPageInfo(self.getPageUrl(section, pageIndex))
        if not pageCode:
            return None
        soup = BeautifulSoup(pageCode, 'lxml')
        articleTags = soup.find_all('div', class_='article block untagged mb15')
        pageArticles = []
        try:
            for articleTag in articleTags:
                parsed = self._parseArticle(articleTag)
                if parsed is not None:
                    pageArticles.append(parsed)
        except Exception:
            return False
        # Empty pages are appended too, so that self._stories[i] always
        # corresponds to page i + 1.
        self._stories.append(pageArticles)
        return True

    def loadNextPage(self):
        """Load the next not-yet-loaded page of self.section.

        Returns True and advances the internal page counter on success;
        returns False (counter unchanged) when the page limit was reached or
        the page could not be loaded.
        """
        if self._pageIndex > self.maxPageIndex:
            return False
        if self.getPageArticles(self.section, self._pageIndex):
            self._pageIndex += 1
            return True
        return False

    def loadSomePages(self, pageNums):
        """Call loadNextPage() `pageNums` times.

        The page counter only advances on success, so a failed page is
        retried by the following call.
        """
        for _ in range(pageNums):
            self.loadNextPage()

    def getRandomArticle(self):
        """Return a random cached article dict, or None if there is none.

        The returned dict is the cached one, with 'pageIndex' and
        'articleIndex' (both 1-based) written into it.
        """
        # Pages without any article are skipped: random.randint(0, -1) on
        # such a page would raise ValueError.
        nonEmptyPages = [index for index, page in enumerate(self._stories) if page]
        if not nonEmptyPages:
            return None
        pageIndex = random.choice(nonEmptyPages)
        articleIndex = random.randrange(len(self._stories[pageIndex]))
        article = self._stories[pageIndex][articleIndex]
        article['pageIndex'] = pageIndex + 1
        article['articleIndex'] = articleIndex + 1
        return article
最后編輯于
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

  • 黑云又來了! ??!黑云又來了! ?。?!黑云! 繚繞著塵滓, 彌漫著瘴毒, 向著我的頭頂壓過來。 我看到, 地獄之門...
    石逸軒閱讀 260評論 0 2
  • 鄭毅聽到賈老師在衛生間踢水盆的聲音,略一停頓,張了張嘴,可還是把到了嘴邊的怒火壓了下去。算了,一大清早,不惹氣了。...
    樵砥閱讀 215評論 2 3
  • 那啥綜合癥又犯了,為什么都不理解,一定要問個為什么呢?
    _獨家記憶閱讀 333評論 0 1
  • 一、 孤獨就像人說的那樣 最后走的人關門最輕 ——網易云熱評 二、 小時候 畫在手上的表沒有動 卻帶走了我們最好的...
    一條芒狗閱讀 216評論 0 3

友情鏈接更多精彩內容