# coding=utf-8
from selenium import webdriver
from bs4 import BeautifulSoup
import pymongo
import xlwt
# Main entry point: scrape the typhoon track page with Selenium, pick the
# track rows out of the page's <span> elements, then export them to an Excel
# sheet and to a MongoDB collection.
#
# NOTE(review): several string literals below contain parenthesised pinyin
# fragments (for example "(wǎng)" or "(shù)") that look like text-conversion
# artefacts. They are reproduced unchanged; confirm them against the original
# file, and confirm that ACTIVE_MARK is written the same way as on the page.

URL = "http://typhoon.weather.com.cn/gis/typhoon_p.shtml"
CACHE_FILE = "data2.txt"
SEPARATOR = '----------------------------------------------------------'
ACTIVE_MARK = '活動中'
# Sheet columns: time, pressure, wind speed, longitude, latitude.
SHEET_TITLES = ('時間', '氣壓(hPa)', '風速(m/s)', '經(jīng)度', '緯度')


def to_degrees(text, positive, negative):
    """Convert one coordinate component such as '26.3N' to a number.

    The last character must be *positive* or *negative* (e.g. 'N'/'S'); the
    value is negated for *negative*.  Components without a decimal point
    are returned as ``int``, everything else as ``float``.

    Raises:
        ValueError: if the suffix or the numeric part is not recognised.
    """
    text = text.strip()
    if not text:
        raise ValueError("empty coordinate component")
    suffix, digits = text[-1], text[:-1]
    if suffix == positive:
        sign = 1
    elif suffix == negative:
        sign = -1
    else:
        raise ValueError("unexpected hemisphere suffix in %r" % text)
    value = int(digits) if digits.isdigit() else float(digits)
    return sign * value


def split_coordinates(text):
    """Split 'LAT/LON' text such as '26.3N/119.5E' into (longitude, latitude).

    Raises:
        ValueError: if *text* has no '/' or a component cannot be parsed.
    """
    lat_text, slash, lon_text = text.partition('/')
    if not slash:
        raise ValueError("no '/' separator in %r" % text)
    return to_degrees(lon_text, 'E', 'W'), to_degrees(lat_text, 'N', 'S')


def is_track_row(cells):
    """Return True if *cells* is a (time, 'LAT/LON', pressure, wind) row.

    A row is recognised by a time cell starting with '2' (the year) and a
    coordinate cell that parses.  The pressure and wind cells are not
    inspected, so the "null null" landfall marker row is accepted as well.
    """
    if len(cells) != 4:
        return False
    time_text, coords = cells[0], cells[1]
    if not time_text or not coords or not time_text.startswith('2'):
        return False
    try:
        split_coordinates(coords)
    except ValueError:
        return False
    return True


def extract_track(texts):
    """Separate the span texts into status lines and track rows.

    The page is assumed to list the track as consecutive groups of four
    spans, e.g. (sample taken from the original comment):

        2018-07-11 10時   26.3N/119.5E           970   35
        2018-07-11 09時   26.40000N/119.85000E   null  null   <- landfall marker
        2018-07-11 09時   26.4N/119.9E           960   42

    Everything before the first such group is treated as page header.  This
    replaces the earlier pop-from-the-front loops, which could index past
    the end of the list or hit ``None`` entries when the layout shifted.

    Args:
        texts: span contents in document order; entries may be ``None``.

    Returns:
        ``(statuses, rows)``: *statuses* are the header texts that start with
        '2' and contain ACTIVE_MARK; *rows* is a list of 4-tuples of cells.
    """
    texts = list(texts)
    start = next(
        (i for i in range(len(texts) - 3) if is_track_row(texts[i:i + 4])),
        len(texts),
    )
    statuses = [
        text for text in texts[:start]
        if text and text.startswith('2') and ACTIVE_MARK in text
    ]
    rows = []
    for i in range(start, len(texts) - 3, 4):
        cells = texts[i:i + 4]
        if not is_track_row(cells):
            break  # the contiguous run of track rows has ended
        rows.append(tuple(cells))
    return statuses, rows


def fetch_page_source(url):
    """Render *url* in Firefox and return the resulting HTML source."""
    browser = webdriver.Firefox()
    try:
        browser.get(url)  # loads the page and lets the browser render it
        return browser.page_source
    finally:
        browser.quit()  # always release the browser process


def write_excel(rows, longitudes, latitudes):
    """Write the track rows to '臺風.xls' (one sheet, title row first)."""
    workbook = xlwt.Workbook(encoding='utf-8')
    booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)
    for column, title in enumerate(SHEET_TITLES):
        booksheet.write(0, column, title)
    numbered = enumerate(zip(rows, longitudes, latitudes), start=1)
    for line, (row, lon, lat) in numbered:
        time_text, _coords, pressure, wind = row
        if pressure == "null":
            # Marker rows are skipped; their sheet line stays empty, as in
            # the original row numbering.
            continue
        booksheet.write(line, 0, time_text)
        booksheet.write(line, 1, pressure)
        booksheet.write(line, 2, wind)
        booksheet.write(line, 3, lon)
        booksheet.write(line, 4, lat)
    workbook.save('臺風.xls')


def write_mongodb(rows, longitudes, latitudes):
    """Replace the contents of mydb.taifeng_set with the track rows."""
    client = pymongo.MongoClient()
    try:
        my_set = client.mydb.taifeng_set  # created on first insert if absent
        # delete_many/insert_one replace Collection.remove()/insert(), which
        # no longer exist in PyMongo 4.
        my_set.delete_many({})  # clear old documents to avoid duplicates
        for row, lon, lat in zip(rows, longitudes, latitudes):
            my_set.insert_one({"時間": row[0], "經(jīng)度": lon, "緯度": lat,
                               "氣壓(hPa)": row[2], "風速(m/s)": row[3]})
        for document in my_set.find():
            print(document)
    finally:
        client.close()


def main():
    """Run the scrape -> clean -> export pipeline."""
    print("準備開始爬取目標網(wǎng)頁!")
    print(SEPARATOR)
    html = fetch_page_source(URL)
    # Keep a copy of the raw page on disk, as before.
    with open(CACHE_FILE, 'wb') as cache:
        cache.write(html.encode())
    print("網(wǎng)頁數(shù)據(jù)爬取完畢!")
    print(SEPARATOR)

    print("開始數(shù)據(jù)處理!")
    print(SEPARATOR)
    # Parse the HTML itself; the earlier str(readlines()) round trip fed the
    # repr of a list (brackets, quotes, escaped newlines) to the parser.
    soup = BeautifulSoup(html, 'lxml')
    texts = [
        None if span.string is None else str(span.string)
        for span in soup.find_all('span')
    ]
    statuses, rows = extract_track(texts)
    coordinates = [split_coordinates(row[1]) for row in rows]
    jd = [lon for lon, _lat in coordinates]  # longitudes
    wd = [lat for _lon, lat in coordinates]  # latitudes
    print('jd', jd)
    print(len(jd))
    print('wd', wd)
    print(len(wd))
    print("數(shù)據(jù)處理完畢!")
    print(SEPARATOR)

    # Print the final result.
    print(len(rows) * 4)  # number of cells, as the flat list reported
    for status in statuses:
        print(status)
    print(" ", '時間', " ", '經(jīng)緯度', " ", '氣壓(hPa)', " ", '風速(m/s)')
    for row in rows:
        print(row[0], " ", row[1], " ", row[2], " ", row[3])

    # Store the data in Excel.
    print(SEPARATOR)
    write_excel(rows, jd, wd)
    print("數(shù)據(jù)已成功導(dǎo)入Excle中!")

    # Store the data in MongoDB.
    print(SEPARATOR)
    write_mongodb(rows, jd, wd)
    print(SEPARATOR)
    print("數(shù)據(jù)已成功導(dǎo)入Mongodb中!")
    print(SEPARATOR)
    print("end...")


if __name__ == '__main__':
    main()