欧美日韩中出,518精品在线

offensive.py（爬取項目歷史更新內容）

#!/usr/bin/env python

# -*- coding:utf-8 -*-

import re

import time

import urllib.request

import conf as cf

# Releases listing the crawler starts from.
BASE_URL = "https://github.com/offensive-security/exploitdb/releases"

# Captures an archive href up to (but not including) its trailing "zip".
DOWNLOAD_LINK_PATTERN = r'href="(.*?)zip" rel="nofollow">'

# Shared tail of both pagination patterns: the anchor labelled "Next".
_NEXT_ANCHOR = r'<a rel="nofollow" href="(.*?)">Next.*'

# "Next" link when it directly follows a closing </span>
# (presumably the disabled "Previous" label on the first page -- TODO confirm).
FIRST_PATTERN = r'</span>' + _NEXT_ANCHOR

# "Next" link when it directly follows an active "Previous" anchor.
PAGE_PATTERN = r'>Previous</a>' + _NEXT_ANCHOR

class MyCrawler:
    """Walk the paginated releases listing and append archive links to result.txt.

    Attributes:
        base_url: URL of the listing page that will be fetched next.
        start_page: Label shown in the progress message for the current page.
            After the first page it holds the URL of the page being processed.
    """

    def __init__(self, base_url=BASE_URL, start_page="first 1 page"):
        self.base_url = base_url
        self.start_page = start_page

    @staticmethod
    def _read_page(url):
        """Return the body of *url* decoded as UTF-8, dropping undecodable bytes.

        Raises:
            urllib.error.HTTPError: when the server answers with an error status.
        """
        req = urllib.request.Request(url)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf8', 'ignore')

    @staticmethod
    def _store_links(doc):
        """Append every distinct archive link found in *doc* to result.txt."""
        from urllib.parse import urljoin

        # dict.fromkeys removes duplicates while keeping the order on the page.
        hrefs = dict.fromkeys(re.findall(DOWNLOAD_LINK_PATTERN, doc))
        with open('result.txt', 'a') as out:
            for item in hrefs:
                # The pattern stops just before "zip", so put it back. urljoin
                # copes with both root-relative and absolute hrefs and avoids
                # the doubled slash plain concatenation would produce.
                url = urljoin("https://github.com/", item + "zip")
                print('Storing {}'.format(url))
                out.write(url + '\n')
                # Pacing between entries, kept from the original behaviour.
                time.sleep(7)

    def _crawl_current_page(self, next_pattern, banner):
        """Process the page at ``self.base_url`` and step to the next one.

        Args:
            next_pattern: Regex whose first group captures the next page href.
            banner: Format string for the progress message; receives
                ``self.start_page``.

        Returns:
            True when a next-page link was found and ``base_url`` was advanced,
            False when the current page is the last one.

        Raises:
            urllib.error.HTTPError: when the page cannot be fetched.
        """
        from urllib.parse import urljoin

        doc = self._read_page(self.base_url)
        next_page = re.search(next_pattern, doc, re.M | re.I)
        print(banner.format(self.start_page))
        # Wait 5 seconds between pages.
        time.sleep(5)
        # Reuse the document that was just downloaded instead of fetching
        # the same page a second time.
        self._store_links(doc)
        if next_page is None:
            # No "Next" link: this was the last page.
            return False
        next_url = urljoin(self.base_url, next_page.group(1))
        self.start_page = next_url
        self.base_url = next_url
        return True

    # Crawl the first page.
    def first_page(self):
        """Crawl the first listing page, then continue with the following pages.

        An HTTP error on the first page is printed and the crawl still proceeds
        to ``fetch_next_page`` (which retries ``base_url``), as before.
        """
        try:
            has_next = self._crawl_current_page(
                FIRST_PATTERN, 'Now working on page = {}\n')
        except urllib.error.HTTPError as err:
            print(err.msg)
            has_next = True
        if has_next:
            self.fetch_next_page()

    # Pagination.
    def fetch_next_page(self):
        """Crawl page after page until there is no next link or a request fails."""
        while True:
            try:
                has_next = self._crawl_current_page(
                    PAGE_PATTERN, 'Now working on page {}\n')
            except urllib.error.HTTPError as err:
                print(err.msg)
                break
            if not has_next:
                break

    # File output: store the download links in a file.
    def fetch_download_link(self, Aurl):
        """Fetch *Aurl* and append its archive download links to result.txt.

        Args:
            Aurl: URL of a releases listing page.

        Raises:
            urllib.error.HTTPError: when the page cannot be fetched.
        """
        self._store_links(self._read_page(Aurl))

    def run(self):
        """Record the download links of the page at ``base_url``."""
        # The original call omitted the required URL argument and always
        # raised TypeError.
        self.fetch_download_link(self.base_url)

# Script entry point: crawl the whole release history starting at the first page.
if __name__ == "__main__":
    crawler = MyCrawler()
    crawler.first_page()

text.py（監控首頁更新，并爬取）

#!/usr/bin/env python

# -*- coding:utf-8 -*

from selenium import webdriver

import re

import time

import urllib.request

import conf as cf

# Releases listing that is polled for changes.
BASE_URL = "https://github.com/offensive-security/exploitdb/releases"

# Captures an archive href up to (but not including) its trailing "zip".
DOWNLOAD_LINK_PATTERN = r'href="(.*?)zip" rel="nofollow">'

# Captures the href of the "Next" anchor that directly follows a closing </span>.
FIRST_PATTERN = r'</span>' r'<a rel="nofollow" href="(.*?)">Next.*'

# Monitor the project's home page for updates

def jiankong_page(interval_seconds=5 * 60 * 60):
    """Poll the releases page and append newly published archive links to result.txt.

    The page counts as updated when the href of its "Next" pagination link
    differs from the one seen on the previous poll. On an update, the first
    download link on the page is printed and appended to result.txt.

    Args:
        interval_seconds: Pause between polls, in seconds (default: 5 hours).

    Raises:
        urllib.error.HTTPError: if the initial fetch fails. An HTTP error
            during a later poll is printed and ends the monitoring loop.
    """
    from urllib.parse import urljoin

    def read_listing():
        """Return the releases page decoded as UTF-8, dropping bad bytes."""
        req = urllib.request.Request(BASE_URL)
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf8', 'ignore')

    def next_link(doc):
        """Return the "Next" pagination href, or None when the page has none."""
        match = re.search(FIRST_PATTERN, doc, re.M | re.I)
        return match.group(1) if match else None

    def to_url(href):
        """Restore the "zip" suffix the pattern leaves out and resolve the href."""
        return urljoin("https://github.com/", href + "zip")

    print("star monitoring ")
    doc = read_listing()
    flag_page = next_link(doc)

    # Archive URLs present on the home page at start-up. New URLs are appended
    # below; the list is an in-memory record only and is not read otherwise.
    flag_list = [to_url(item)
                 for item in dict.fromkeys(re.findall(DOWNLOAD_LINK_PATTERN, doc))]

    # Periodic scan (once per interval).
    while True:
        try:
            time.sleep(interval_seconds)
            doc = read_listing()
        except urllib.error.HTTPError as err:
            print(err.msg)
            break

        # A changed pagination link is taken as the sign of an update.
        current_page = next_link(doc)
        if current_page == flag_page:
            print("No update")
            continue

        print("have update")
        flag_page = current_page
        # Take the first match, i.e. the most recently published archive.
        # (The original called the non-existent re.rearch here.)
        item = re.search(DOWNLOAD_LINK_PATTERN, doc, re.M | re.I)
        if item is None:
            # The page changed but exposes no archive link; nothing to record.
            continue
        new_url = to_url(item.group(1))
        print("new url = " + new_url)
        flag_list.append(new_url)
        with open('result.txt', 'a') as f:
            f.write(new_url + '\n')

# Script entry point: start monitoring the releases page.
if __name__ == "__main__":
    jiankong_page()

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

基于python爬蟲的github-exploitdb漏洞庫監控與下載

基于python爬蟲的github-exploitdb漏洞庫監控與下載

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

基于python爬蟲的github-exploitdb漏洞庫監控與下載

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av