緩存就是將頁面信息下載下來,避免二次下載
import os
import requests
from pyquery import PyQuery as pq
"""
這個爬蟲可以爬 10 個頁面, 把所有 TOP250 電影都爬出來
并且加入了緩存頁面功能
再也不用重復請求了(網絡請求很浪費時間)
這樣做有兩個好處
1, 增加新內容(比如增加評論人數)的時候不用重復請求網絡
2, 出錯的時候有原始數據對照(比如 消失的愛人 沒有 quote)
"""
class Model:
    """Base class that gives subclasses a readable, multi-line ``repr``."""

    def __repr__(self):
        """Return the class name followed by one ``attr=(value)`` entry per instance attribute."""
        entries = []
        # vars(self) is the instance __dict__, so entries appear in the
        # order the attributes were assigned.
        for attr, value in vars(self).items():
            entries.append('{}=({})'.format(attr, value))
        body = '\n '.join(entries)
        return '\n<{} \n {}>'.format(type(self).__name__, body)
class Movie(Model):
    """Record for one movie entry; every field starts out empty/zero."""

    def __init__(self):
        """Create the fields with placeholder values.

        The assignment order is significant: it is the order in which
        ``Model.__repr__`` lists the attributes.
        """
        # Text fields start as empty strings, numeric ones as 0.
        self.name, self.other = '', ''
        self.score = 0
        self.quote, self.cover_url = '', ''
        self.ranking = 0
def cached_page(url):
    """Return the raw bytes of the page at ``url``, downloading it at most once.

    Pages are stored in the ``cacheddouban`` folder (relative to the current
    working directory). The file name is the part of ``url`` after the first
    ``=`` plus ``.html``, e.g. ``...top250?start=100`` -> ``100.html``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page body as ``bytes``, read from the cache file when it exists,
        otherwise freshly downloaded (and then written to the cache).

    Raises:
        requests.exceptions.RequestException: The download failed, timed out,
            or the server answered with a 4xx/5xx status. Nothing is cached
            in that case.
    """
    folder = 'cacheddouban'
    # exist_ok removes the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)

    # https://movie.douban.com/top250?start=100 -> "100"
    key = url.split('=', 1)[-1]
    # A URL without '=' yields the whole URL as key; neutralise characters
    # that are path separators or invalid in file names so the cache file
    # always lands directly inside `folder`.
    safe_key = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in key)
    path = os.path.join(folder, '{}.html'.format(safe_key))
    print(path)

    try:
        with open(path, 'rb') as f:
            return f.read()
    except FileNotFoundError:
        # Cache miss: fall through to the network request below.
        pass

    # Send the network request; a timeout keeps a stalled server from
    # hanging the crawler forever.
    r = requests.get(url, timeout=30)
    # Refuse to cache error pages: once written they would be served on
    # every later run instead of the real content.
    r.raise_for_status()
    with open(path, 'wb') as f:
        f.write(r.content)
    return r.content
- 如果不存在目標文件夾,則新建文件夾
- os.path.exists() 判斷給定的路徑是否存在
- 建立文件存儲頁面信息,文件名取自當前 url 中 '=' 之後的部分
- os.path.join() 把文件夾和文件名拼接成緩存文件的路徑
- 如果該路徑已存在,讀取文件內容返回就行
- 如果不存在,就發送請求,並將獲取的頁面內容存進去
注意:
從緩存文件讀出的 s 與請求返回的 r.content 內容相同,即 s == r.content
def movie_from_div(div):
    """Build a ``Movie`` from one movie element of a TOP250 page.

    Each field is stored exactly as PyQuery's ``.text()`` / ``.attr()``
    returns it; no conversion is applied.
    """
    node = pq(div)
    movie = Movie()

    # Fields that are simply the text of the first-level selector match.
    text_fields = (
        ('name', '.title'),
        ('other', '.other'),
        ('score', '.rating_num'),
        ('quote', '.inq'),
    )
    for attr, selector in text_fields:
        setattr(movie, attr, node(selector).text())

    movie.cover_url = node('img').attr('src')
    # The ranking number is the <em> nested inside the .pic element.
    movie.ranking = node('.pic').find('em').text()
    return movie
def movies_from_url(url):
    """Return a list of ``Movie`` objects, one per ``.item`` element of the page at ``url``.

    The page is obtained through ``cached_page``, so a page that is already
    cached is not requested again.
    """
    document = pq(cached_page(url))
    # Each .item element is handed to movie_from_div.
    return [movie_from_div(item) for item in document('.item')]
def main():
    """Walk the ten TOP250 pages (25 movies each) and print the movies of every page."""
    page_size = 25
    for start in range(0, 250, page_size):
        page_url = 'https://movie.douban.com/top250?start={}'.format(start)
        print('top250 movies', movies_from_url(page_url))
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()