99国产日韩新片网站,欧美亚洲韩国三级

搞死搞殘大眾點評，廢話不多說，源碼參上

import json

import re

from bs4 import BeautifulSoup

import time

from selenium import webdriver

import requests.models

import pandas as pd

from urllib.parse import urlencode

from threading import Thread

# Search keyword typed by the user; it drives both the browser URL and the
# API query.
keyword = input('your keyword')

# Destination CSV; the '.csv' suffix is always appended to what the user types.
output_filename = input('output csv path') + '.csv'

# JSON endpoint that the mobile site itself calls for search results.
post_url = 'https://m.dianping.com/isoapi/module'

# Search page opened in the browser to obtain cookies. The query string is
# built with urlencode so non-ASCII keywords, spaces and '&' are escaped.
true_url = 'https://m.dianping.com/shoplist/4/search?' + urlencode(
    {'from': 'm_search', 'keyword': keyword}
)

# Headers sent with every POST to ``post_url``.
# NOTE: no Content-Length entry -- requests computes it from the actual body,
# so a hard-coded value would only be misleading.
headers = {
    'Connection': 'keep-alive',
    'Origin': 'https://m.dianping.com',
    'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
    'Content-Type': 'application/json',
    'Accept': '*/*',
    # Referer matches the page actually being searched.
    'Referer': true_url,
}

def get_cookies():
    """Load ``true_url`` in Chrome via Selenium and return its cookies.

    Chrome is started with a fixed desktop user-agent and routed through the
    local proxy at ``127.0.0.1:8888``. After loading the page it is scrolled
    15 times (500 px each, 0.5 s apart). If the page source contains the
    captcha prompt, the function blocks on ``input`` until the operator
    presses Enter, giving them time to solve it in the browser window.

    :return: dict mapping cookie name to cookie value
    """
    chrome_options = webdriver.ChromeOptions()
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
    chrome_options.add_argument('user-agent="%s"' % ua)
    chrome_options.add_argument('--proxy-server=http://127.0.0.1:8888')

    # ``options=`` is the supported keyword; ``chrome_options=`` is deprecated
    # and rejected by current Selenium releases.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(true_url)

        for _ in range(15):
            driver.execute_script('window.scrollBy(0, 500)')
            time.sleep(0.5)

        # Captcha prompt ("please enter the content shown in the image").
        if '请输入图片中的内容' in driver.page_source:
            input('12345')

        return {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
    finally:
        # Always close the browser, even if loading or scrolling failed.
        driver.quit()

def get_data(dict_cookies):
    """Page through the search API and append every shop to the output CSV.

    Requests up to 51 pages of 20 results each from ``post_url`` for the
    module-level ``keyword``. For each shop, only scalar fields are kept
    (list and dict values are dropped); the row is printed and appended to
    ``output_filename``. The CSV header is written only with the very first
    row of the first page.

    :param dict_cookies: cookie name -> value mapping sent with each request
    :raises KeyError: if the response JSON lacks the expected nesting
    """
    page_size = 20
    for start in range(0, page_size * 51, page_size):
        payload = {
            "pageEnName": "shopList",
            "moduleInfoList": [
                {
                    "moduleName": "mapiSearch",
                    "query": {
                        "search": {
                            "start": start,
                            "categoryId": 0,
                            "parentCategoryId": 0,
                            "locateCityid": 0,
                            "limit": page_size,
                            "sortId": 0,
                            "cityId": 4,
                            # Use the keyword the user entered rather than a
                            # fixed search term.
                            "keyword": keyword,
                            "regionId": 0,
                            "maptype": 0
                        }
                    }
                }
            ]
        }

        r = requests.post(post_url, headers=headers, cookies=dict_cookies, json=payload, verify=False)
        json_data = r.json()
        shops = json_data['data']['moduleInfoList'][0]['moduleData']['data']['listData']['list']

        # An empty page means the results are exhausted; stop instead of
        # issuing the remaining requests.
        if not shops:
            break

        for index, shop in enumerate(shops):
            # Nested structures cannot be represented in a flat CSV cell.
            item = {k: v for k, v in shop.items() if not isinstance(v, (list, dict))}
            print(item)

            header = start == 0 and index == 0
            df = pd.DataFrame(data=item, index=['0'])
            df.to_csv(output_filename, mode='a', index=False, header=header, encoding='utf_8_sig')

def read_csv():
    """Read the CSV at ``output_filename`` and print the shopId of each row.

    Malformed lines are skipped rather than aborting the read.

    :raises KeyError: if the file has no ``shopId`` column
    """
    try:
        df = pd.read_csv(output_filename, on_bad_lines='skip')
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; use the old spelling.
        df = pd.read_csv(output_filename, error_bad_lines=False)

    # Iterate the column directly: ``iterrows`` builds one Series per row and
    # may upcast integer ids to float.
    for shop_id in df['shopId']:
        print('ShopId: ' + str(shop_id))

if __name__ == '__main__':
    # Obtain session cookies through the browser first.
    dict_cookies = get_cookies()

    # Scrape, then read back. The reader previously ran in a second thread
    # started after a fixed 15 s delay, so it raced the writer: it could see
    # a partially written file, or fail if the file did not exist yet.
    get_data(dict_cookies)
    read_csv()

解釋下：先使用 selenium 打開 m.dianping.com 網站，在關鍵詞已輸入的情況下進行翻頁，然後獲取到 cookies；拿到 cookies 後請求接口數據，保存到 output_filename（csv 文件）中。需要提供的是關鍵詞與保存路徑，輸出默認是 csv 文件。

如果使用正常的 selenium 手法去訪問，必然會出現驗證碼跟異常操作的問題。參考：http://m.itdecent.cn/p/304f4dfae0bb

使用 mitmproxy 作為中間代理，selenium 通過代理進行訪問，代理中對請求進行過濾，過濾掉某些參數就可以防止這樣的反爬手段。filter_js.py 屏蔽代碼參上

import re

from mitmproxy import ctx


def response(flow):
    """Modify the response data so the page script cannot detect Selenium.

    Only responses whose request URL contains ``/js/yoda.`` are touched.
    Every known webdriver marker that appears as a double-quoted string is
    replaced with ``"NO-SUCH-ATTR"``; afterwards ``t.webdriver`` is replaced
    with ``false`` and any remaining ``ChromeDriver`` text is removed.

    :param flow: the mitmproxy flow whose response body is rewritten in place
    """
    if '/js/yoda.' not in flow.request.url:
        return

    text = flow.response.text

    # Hide Selenium detection markers.
    for webdriver_key in ['webdriver', '__driver_evaluate', '__webdriver_evaluate', '__selenium_evaluate',
                          '__fxdriver_evaluate', '__driver_unwrapped', '__webdriver_unwrapped',
                          '__selenium_unwrapped', '__fxdriver_unwrapped', '_Selenium_IDE_Recorder', '_selenium',
                          'calledSelenium', '_WEBDRIVER_ELEM_CACHE', 'ChromeDriverw', 'driver-evaluate',
                          'webdriver-evaluate', 'selenium-evaluate', 'webdriverCommand',
                          'webdriver-evaluate-response', '__webdriverFunc', '__webdriver_script_fn',
                          '__$webdriverAsyncExecutor', '__lastWatirAlert', '__lastWatirConfirm',
                          '__lastWatirPrompt', '$chrome_asyncScriptInfo', '$cdc_asdjflasutopfhvcZLmcfl_']:
        ctx.log.info('Remove "{}" from {}.'.format(webdriver_key, flow.request.url))
        text = text.replace('"{}"'.format(webdriver_key), '"NO-SUCH-ATTR"')

    # These run after the quoted-key pass so the order of substitutions is
    # the same as applying each replace to the response text in sequence.
    text = text.replace('t.webdriver', 'false')
    text = text.replace('ChromeDriver', '')

    # Assign once so the body is re-encoded a single time.
    flow.response.text = text

代理命令截圖參上

監聽 8888 端口，並用自定義腳本 filter_js 屏蔽 selenium 的檢測。在 csv 文件中拿到了 shopId，通過它構建的 url 可以抓取詳情頁面數據，這個就比較簡單了。

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

(爬蟲源碼全公開)征服所有網站之大眾點評

(爬蟲源碼全公開)征服所有網站之大眾點評

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

(爬蟲源碼全公開)征服所有網站之大眾點評

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av