Week2_Practice3

getHref.py

函數功能:獲取所有的租房鏈接


from getMainPageInformation import *
from bs4 import  BeautifulSoup
import requests


# Page 1 is served from the site root; pages 2-19 use the paginated search URL.
url1 = 'http://bj.xiaozhu.com/'
urls = [
    'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page)
    for page in range(2, 20)
]

sourceData = []
# Process the first page, then every paginated page in ascending order.
hreflist = MainPageInformation(url1)
for page_url in urls:
    MainPageInformation(page_url)


getMainPageInformation.py

函數功能:獲取租房鏈接

from bs4 import  BeautifulSoup
import requests
import pymongo

# Connect to the MongoDB server on localhost and bind the collection that
# stores the listing links.
client = pymongo.MongoClient(host='localhost', port=27017)
Xiaozhu = client.Xiaozhu
Href = Xiaozhu.href

def MainPageInformation(url):
    """Fetch one search-result page and store every listing link found on it.

    Each matching anchor's ``href`` is inserted into the ``Href`` collection
    as a ``{'href': ...}`` document.

    Args:
        url: Address of the search-result page to fetch.

    Returns:
        list: The ``href`` values that were stored, in page order. Empty when
        the page contains no matching anchors.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    anchors = soup.select('ul > li > a[class="resule_img_a"]')

    links = []
    for anchor in anchors:
        link = anchor.get('href')
        if not link:
            # An anchor without a target gives nothing to crawl later.
            continue
        Href.insert_one({'href': link})
        links.append(link)
    return links



getPageInformation.py

函數功能:獲取租房信息(可斷點續接)

from bs4 import  BeautifulSoup
import requests
import pymongo

# Connect to the MongoDB server on localhost and bind both collections:
# the scraped listing details and the listing links.
client = pymongo.MongoClient(host='localhost', port=27017)
Xiaozhu = client.Xiaozhu
Information = Xiaozhu.Information
Href = Xiaozhu.href

def getPageInformation(url):
    """Fetch one listing page and store its details in ``Information``.

    The title, room image, price, address, host image and host name are read
    from the page with CSS selectors and stored together with ``url`` under
    the ``href`` key. When the selectors yield several complete groups, the
    last one is stored.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        dict or None: The stored record, or ``None`` when any of the selectors
        matched nothing (in which case nothing is inserted).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')

    titles = soup.select('div.pho_info > h4 > em')
    room_images = soup.select('#curBigImage')
    prices = soup.select("div.day_l > span")
    addresses = soup.select('div.pho_info > p > span.pr5')
    host_images = soup.select('div.member_pic > a > img')
    host_names = soup.select('div.w_240 > h6 > a')

    record = None
    matches = zip(titles, room_images, prices, addresses, host_images, host_names)
    for title, room_image, price, address, host_image, host_name in matches:
        record = {
            'title': title.get_text(),
            'roomImage': room_image.get('src'),
            'price': price.get_text(),
            'address': address.get_text().strip(),
            'hosterImage': host_image.get('src'),
            'hosterName': host_name.get_text(),
            'href': url,
        }

    if record is None:
        # Nothing was parsed: do not store an empty document, which would
        # carry no 'href' and so could never be matched back to its link.
        return None

    Information.insert_one(record)
    return record

def getInformation():
    """Scrape every stored listing link that has no stored details yet.

    Compares the ``href`` values in the ``Href`` collection with those already
    present in ``Information`` and calls ``getPageInformation`` for each link
    found only in the former, so an interrupted run can be resumed without
    re-fetching finished pages.
    """
    # Cursors yield dicts, which are unhashable, so the comparison is done on
    # the href strings themselves.
    known_links = {doc.get('href') for doc in Href.find()}
    # The projection is passed positionally because its keyword name differs
    # between PyMongo major versions (``fields`` vs ``projection``).
    finished_links = {
        doc.get('href')
        for doc in Information.find({}, {'href': True, '_id': False})
    }
    # Documents without an 'href' contribute None; there is nothing to fetch.
    known_links.discard(None)

    # Sorted only to make the crawl order reproducible between runs.
    for link in sorted(known_links - finished_links):
        getPageInformation(link)



# Example listing-page URL, kept for trying getPageInformation() by hand:
# url='http://bj.xiaozhu.com/fangzi/269024901.html'
# Runs at module load: start the crawl of listing pages.
getInformation()
最后編輯于
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容