分成兩部分,存儲到mongodb和查找,較快完成,不過select的查找過長,下次爭取用較短的元素定位。
我的成果

屏幕快照 2016-09-01 下午2.28.01.jpg
我的代碼
from bs4 import BeautifulSoup
import requests
from pymongo import MongoClient
# Client for the MongoDB server at localhost:27017.
client = MongoClient(host='localhost', port=27017)

# Database handle and the two collections used by this script:
# ``page_list`` holds the collected URLs, ``apartment`` the scraped records.
xiaozhu = client['xiaozhu']
page_list = xiaozhu['page_list']
apartment = xiaozhu['apartment']
def get_links():
    """Collect ``detailurl`` values from pages 1-3 of the search URL.

    Each page is fetched and parsed, and for every element matched by the
    result selector the value of its ``detailurl`` attribute is stored as a
    ``{'url': ...}`` document in the ``page_list`` collection. Elements that
    carry no ``detailurl`` attribute are skipped.
    """
    urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(1, 4)]
    for url in urls:
        # Without a timeout, requests may wait on an unresponsive server forever.
        wb_data = requests.get(url, timeout=10)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        links = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname')
        for link in links:
            detail_url = link.get('detailurl')
            # Tag.get returns None when the attribute is absent; storing that
            # would make a later get_info(None) call fail inside requests.
            if detail_url:
                page_list.insert_one({'url': detail_url})
def get_info(url):
    """Scrape one page and store its title, area and price.

    The page at ``url`` is fetched and parsed, and a document with the keys
    ``title``, ``area`` and ``price`` is inserted into the ``apartment``
    collection. ``area`` is ``None`` when the area selector matches nothing.

    Args:
        url: Address of the page to fetch.

    Raises:
        IndexError: If the title or price selector matches no element.
    """
    # Without a timeout, requests may wait on an unresponsive server forever.
    wb_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    area = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    price = soup.select('#pricePart > div.day_l > span')
    data = {
        'title': title[0].get_text(),
        # Guard on the list that is actually indexed, so an empty match
        # yields None instead of an IndexError.
        'area': area[0].get_text() if area else None,
        # Kept as the element's text; the query loop in this file converts
        # it with int() before comparing.
        'price': price[0].get_text(),
    }
    apartment.insert_one(data)
    print('done')
# Crawl steps, left disabled in the original script:
# get_links()
# for i in page_list.find():
#     get_info(i['url'])

# Print title and area of every stored record whose price is 500 or more.
# The price is stored as text, hence the int() conversion before comparing.
for listing in apartment.find():
    if int(listing['price']) < 500:
        continue
    print(listing['title'], listing['area'])
總結
- 存儲價格的時候是字符型了,所以查找的時候需要轉化
- 簡化定位元素