from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
import requests
from bs4 import BeautifulSoup
import csv
import ssl
import re
import time
# Point the ssl module's default HTTPS context factory (a private hook) at the
# unverified-context factory, so code relying on that default skips
# certificate verification.
# NOTE(review): this applies to the whole process and weakens TLS checks; the
# visible code only requests an http:// page, so confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context
def get_newURL(surname):
    """Search the catalogue for *surname* and return the result page URL.

    Starts a headless Firefox, types *surname* into the ``expr`` field of the
    search form, clicks the search button, then switches to the second window
    handle and reads its URL.

    The URL is also stored in the module-level ``newurl`` and the driver in
    the module-level ``browser``, because ``get_total_url`` reads both.

    Args:
        surname: Text typed into the search box.

    Returns:
        The URL of the window at index 1 after the search was submitted.

    Raises:
        IndexError: If there is no second window after the one-second wait.
        Exception: Any Selenium error raised while driving the browser.
    """
    global browser, newurl

    options = Options()
    options.add_argument('-headless')
    # NOTE(review): ``executable_path``/``firefox_options`` and the
    # ``find_element_by_*`` helpers are the older Selenium call style; they
    # are kept unchanged here - confirm before upgrading Selenium.
    browser = webdriver.Firefox(
        executable_path=r"C:\Users\weimengxin\Desktop\geckodriver.exe",
        firefox_options=options,
    )
    try:
        browser.get('http://search.library.sh.cn/jiapu/bSearch.htm')
        input_str = browser.find_element_by_name('expr')
        input_str.send_keys(surname)
        # NOTE(review): the button label is kept exactly as found in this
        # file; verify it against the ``value`` attribute on the live page.
        browser.find_element_by_xpath("//*[@value='檢索']").click()
        # Fixed pause before looking for the second window handle.
        time.sleep(1)
        browser.switch_to.window(browser.window_handles[1])
        newurl = browser.current_url
    finally:
        # Always release the browser process, even when a step above fails.
        browser.quit()
    return newurl
def get_next_page(new_url):
    """Open *new_url*, click the "next page" button and return the new URL.

    Starts a headless Firefox, loads *new_url*, clicks the next-page button,
    switches to the first window handle and reads its URL. Each call advances
    exactly one page; callers loop until the call raises.

    The URL is also stored in the module-level ``url_new`` and the driver in
    the module-level ``browser_1``, because ``get_total_url`` reads both.

    Args:
        new_url: URL of the result page to advance from.

    Returns:
        The URL of the first window after the button was clicked.

    Raises:
        Exception: Any Selenium error, including the one raised when the
            next-page button cannot be found.
    """
    global browser_1, url_new

    options = Options()
    options.add_argument('-headless')
    # NOTE(review): older Selenium call style kept unchanged - confirm before
    # upgrading Selenium.
    browser_1 = webdriver.Firefox(
        executable_path=r"C:\Users\weimengxin\Desktop\geckodriver.exe",
        firefox_options=options,
    )
    try:
        browser_1.get(new_url)
        # NOTE(review): the label below is kept exactly as found in this file
        # and looks garbled (it contains "(yè)"); verify it against the
        # ``value`` attribute of the next-page button on the live page.
        browser_1.find_element_by_xpath("//*[@value='下頁(yè)']").click()
        browser_1.switch_to.window(browser_1.window_handles[0])
        url_new = browser_1.current_url
    finally:
        # Always release the browser process, even when the button is missing.
        browser_1.quit()
    return url_new
def get_current_data(url):
    """Fetch *url* and collect the text of every ``td`` element on the page.

    The response is decoded as GBK and parsed with ``html.parser``. Cells
    whose text is exactly ``''``, ``'*'`` or ``' '`` are dropped. The result
    is stored in the module-level ``data`` list and also returned.

    Args:
        url: Address of the page to download.

    Returns:
        The list of remaining cell texts, in document order.
    """
    global data

    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)"
                             "AppleWebKit 537.36 (KHTML, like Gecko) Chrome",
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
    # Close the session (and its pooled connections) once the page is fetched.
    with requests.Session() as session:
        html = session.get(url, headers=headers)
    html.encoding = "GBK"
    bs = BeautifulSoup(html.text, "html.parser")
    content_list = bs.find_all("td")
    # NOTE(review): Beautiful Soup's HTML parsers normally expose lowercase
    # tag names, so "H3" presumably never matches - verify whether "h3" was
    # meant. The literal is kept unchanged.
    content_list_1 = bs.find("H3")

    data = []
    try:
        for content in content_list:
            data.append(content.get_text())
    except Exception:
        # Fallback: record the heading lookup result instead.
        # NOTE(review): unclear which failure this guards against.
        data.append(content_list_1)
    else:
        # Build a filtered list instead of removing items while iterating,
        # which skips the element that follows each removed one.
        data = [text for text in data if text not in ('', '*', ' ')]
    return data
# Load the surname table. Every non-empty CSV row must hold exactly two
# fields; each row becomes one key/value pair of ``surname_dict``.
# ``newline=""`` lets the csv module handle line endings itself.
# NOTE(review): no encoding is given, so the platform default is used -
# confirm it matches how surname.csv was saved.
with open(r"C:\Users\weimengxin\Desktop\surname.csv", "rt", newline="") as sur:
    cin = csv.reader(sur)
    # Blank lines come back as empty rows and would make dict() fail.
    surname = [row for row in cin if row]
    surname_dict = dict(surname)
    # Drop the entry keyed by an empty string; tolerate files without one.
    surname_dict.pop("", None)
def _quit_leftover_browsers():
    """Best-effort quit of the drivers held in ``browser`` and ``browser_1``."""
    for name in ("browser", "browser_1"):
        driver = globals().get(name)
        if driver is None:
            continue
        try:
            driver.quit()
        except Exception:
            # Deliberately ignored: the driver may already have been quit.
            pass


def get_total_url():
    """Collect the result-page URLs for every surname in ``surname_dict``.

    For each value of ``surname_dict`` the suffix "氏" is appended and the
    search is run through ``get_newURL``; ``get_next_page`` is then called
    repeatedly until it raises, and each URL it yields is appended.

    The mapping is stored in the module-level ``urls_dict`` and also
    returned. Every surname gets its own list, including surnames with a
    single result page.

    Returns:
        Dict mapping each search term to the list of its page URLs.
    """
    global urls_dict
    urls_dict = {}

    for value in surname_dict.values():
        index = value + "氏"
        print("查詢%s第1頁(yè)數(shù)據(jù)"%index)
        get_newURL(index)
        # A fresh list per surname, so entries never alias each other.
        urls = [newurl]
        urls_dict[index] = urls

        count = 1
        try:
            while True:
                count += 1
                print("查詢%s第%d頁(yè)數(shù)據(jù)" % (index, count))
                try:
                    get_next_page(urls[-1])
                except Exception:
                    # get_next_page raising is the end-of-results signal.
                    if count == 2:
                        print("%s僅一頁(yè)數(shù)據(jù)!"%index)
                    else:
                        print("%s收集完畢!"%index)
                    break
                if url_new in urls:
                    # The click led to an already-collected URL; stop
                    # instead of looping forever.
                    print("%s收集完畢!"%index)
                    break
                urls.append(url_new)
        finally:
            _quit_leftover_browsers()

    return urls_dict
# surname_set = {}
# for t in surname_dict:
# get_newURL(surname_dict[t] + "氏")
# print("現(xiàn)在自動(dòng)檢索" + surname_dict[t] + "氏數(shù)據(jù)")
# print("-------------------------------------")
# get_current_data(newurl)
# all_data = data.copy()
#
# # Compute the number of result pages that need to be crawled
# try:
# total = all_data[1]
# pattern = re.compile('[0-9]+')
# match = pattern.search(total)
# total_true = int(match.group())
# sheets = total_true//10 + 1
# except:
# print("Notice: 本次檢索未命中記錄!")
# continue
# print("正在獲取第1頁(yè)數(shù)據(jù)... (總共%d頁(yè))" % sheets)
#
# try:
# get_next_page(newurl)
# print("正在獲取第2頁(yè)數(shù)據(jù)... (總共%d頁(yè))" % sheets)
# get_current_data(url_new)
# all_data.extend(data)
# except:
# print("Notice: 僅1頁(yè)數(shù)據(jù)")
# surname_set[surname_dict[t]] = all_data
# browser_1.close()
# continue
#
# count = 2
# while True:
# try:
# get_next_page(url_new)
# get_current_data(url_new)
# count += 1
# print("正在獲取第%d的數(shù)據(jù)... (總共%d頁(yè))" % (count, sheets))
# all_data.extend(data)
# except:
# surname_set[surname_dict[t]] = all_data
# break
# print("爬取" + surname_dict[t] + "氏完成 ?。?!")
# print("--------------------------")
# continue
2018-01-17
最后編輯于 :
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。
相關閱讀更多精彩內容
- 馬太福音第17章 耶穌登山變像,向三位門徒顯榮,是為了歸正他們的信心。門徒因?yàn)橹粚W⒒揭茈y,而忽略基督預(yù)言自己...
- 南懷瑾:“能控制早晨的人,方可控制人生。 富蘭克林:“我未曾見過一個(gè)早起勤奮謹(jǐn)慎誠(chéng)實(shí)的人抱怨命運(yùn)不好?!?你沉醉于...
- 騎兵,顧名思義:既可徒步保障飛機(jī),又可開車征戰(zhàn)機(jī)場(chǎng)。以前總覺得騎兵相比步兵要帥的多,因?yàn)樗麄儾挥每績(jī)蓷l腿提前十幾分...