今天在試圖爬取新聞網站時，想把網頁上的所有 URL 抓下來並歸類，所以寫了幾行小代碼。
為了促進自己學習，就把簡陋的代碼放在簡書上發布，也當是一份學習筆記。
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 3 00:31:06 2016
@author: Cy
"""
import requests
from bs4 import BeautifulSoup
def getallurl(url=r'http://www.sina.com.cn', timeout=10):
    """Fetch a web page and collect every hyperlink found on it.

    The page is downloaded with ``requests.get``, its raw body is parsed
    by BeautifulSoup using the ``lxml`` parser, and every ``<a>`` tag is
    turned into an ``[href, text]`` pair.

    Args:
        url: Address of the page to scan. Defaults to the Sina home page.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get``. Without a timeout a stalled server would
            block the call indefinitely.

    Returns:
        A list of two-element lists ``[href, text]``, one per ``<a>`` tag,
        in the order ``find_all`` yields them. ``href`` is an empty string
        when the tag carries no ``href`` attribute; ``text`` is the tag's
        text content.

    Raises:
        Any exception raised by ``requests.get`` (connection failure,
        timeout, invalid URL, ...) propagates to the caller unchanged.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'lxml')
    # Tag.get() supplies '' for anchors lacking an href, so no exception
    # handling (and in particular no catch-all ``except``) is needed.
    return [[anchor.get('href', ''), anchor.text]
            for anchor in soup.find_all('a')]
if __name__ == '__main__':
    # Demo run: scan the default page and print every link with its text.
    # To scan a different page, pass its address: getallurl('http://...').
    listurl = getallurl()
    for i, (href, title) in enumerate(listurl):
        print("The %sth url is: %s, and the title is: %s \n" % (i, href, title))