基於python爬蟲的github-exploitdb漏洞庫監控與下載

offensive.py(爬取項目歷史更新內容)

#!/usr/bin/env python

# -*- coding:utf-8 -*-

import re

import time

import urllib.request

import conf as cf

BASE_URL = 'https://github.com/offensive-security/exploitdb/releases'

# Captures a link's href up to (not including) its trailing "zip".
DOWNLOAD_LINK_PATTERN = 'href="(.*?)zip" rel="nofollow">'

# "Next" link pattern used for the first page (no "Previous" link in front).
FIRST_PATTERN = r'</span><a rel="nofollow" href="(.*?)">Next.*'

# "Next" link pattern used for later pages (directly after "Previous").
PAGE_PATTERN = r'>Previous</a><a rel="nofollow" href="(.*?)">Next.*'


class MyCrawler:
    """Walk a paginated release listing and append every zip link to a file.

    Args:
        base_url: URL of the first listing page.
        start_page: Label printed for the page currently being processed.
        page_delay: Seconds to wait before moving on to the next page.
        link_delay: Seconds to wait after each stored link.
        output_path: File the download URLs are appended to, one per line.
    """

    def __init__(self, base_url=BASE_URL, start_page="first 1 page", *,
                 page_delay=5, link_delay=7, output_path='result.txt'):
        self.base_url = base_url
        self.start_page = start_page
        self.page_delay = page_delay
        self.link_delay = link_delay
        self.output_path = output_path

    def first_page(self):
        """Process the first page, then keep following "Next" links.

        Stops quietly if the first page cannot be fetched or has no "Next"
        link, instead of going on to re-request the same URL.
        """
        if self._crawl_current_page(FIRST_PATTERN,
                                    'Now working on page = {}\n'):
            self.fetch_next_page()

    def fetch_next_page(self):
        """Process ``self.base_url`` and every following page.

        The loop ends when a request fails or when a page carries no "Next"
        link (the last page of the listing).
        """
        while self._crawl_current_page(PAGE_PATTERN,
                                       'Now working on page {}\n'):
            pass

    def fetch_download_link(self, Aurl):
        """Fetch ``Aurl`` and append all zip links found on it to the file.

        Raises:
            urllib.error.URLError: If the page cannot be retrieved.
        """
        self._store_links(self._fetch(Aurl))

    def run(self):
        """Run the whole crawl, starting from the first page."""
        # The original called fetch_download_link() without its required
        # URL argument, which always raised TypeError.
        self.first_page()

    def _crawl_current_page(self, next_pattern, banner):
        """Store the links of ``self.base_url`` and advance to the next page.

        Returns True when a next page was found and ``self.base_url`` now
        points at it, False when the crawl should stop.
        """
        from urllib.parse import urljoin

        try:
            doc = self._fetch(self.base_url)
        except urllib.error.URLError as err:
            # HTTPError is a URLError subclass; ``reason`` exists on both.
            print(err.reason)
            return False

        print(banner.format(self.start_page))
        # Reuse the document already downloaded rather than fetching twice.
        self._store_links(doc)

        match = re.search(next_pattern, doc, re.M | re.I)
        if match is None:
            return False

        # Resolve relative hrefs against the page they were found on.
        next_url = urljoin(self.base_url, match.group(1))
        self.start_page = next_url
        self.base_url = next_url
        time.sleep(self.page_delay)
        return True

    @staticmethod
    def _fetch(url):
        """Return the body of ``url`` decoded as UTF-8 (bad bytes dropped)."""
        request = urllib.request.Request(url)
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf8', 'ignore')

    def _store_links(self, doc):
        """Append every distinct zip link in ``doc`` to the output file."""
        from urllib.parse import urljoin

        # dict.fromkeys removes duplicates while keeping page order.
        items = dict.fromkeys(re.findall(DOWNLOAD_LINK_PATTERN, doc))
        with open(self.output_path, 'a', encoding='utf-8') as out:
            for item in items:
                # urljoin avoids the "//" a leading "/" in the href produced.
                url = urljoin("https://github.com/", item + "zip")
                print('Storing {}'.format(url))
                out.write(url + '\n')
                time.sleep(self.link_delay)

if __name__ == "__main__":
    # Script entry point: crawl the listing starting from its first page.
    MyCrawler().first_page()

text.py(監控首頁更新,並爬取)

#!/usr/bin/env python

# -*- coding:utf-8 -*

from selenium import webdriver

import re

import time

import urllib.request

import conf as cf

BASE_URL = 'https://github.com/offensive-security/exploitdb/releases'

# Captures a link's href up to (not including) its trailing "zip".
DOWNLOAD_LINK_PATTERN = 'href="(.*?)zip" rel="nofollow">'

# "Next" pagination link of the first page; used as a change marker.
FIRST_PATTERN = r'</span><a rel="nofollow" href="(.*?)">Next.*'


def _fetch_page(url):
    """Return the body of ``url`` decoded as UTF-8 (bad bytes dropped)."""
    request = urllib.request.Request(url)
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf8', 'ignore')


def _next_page_link(doc):
    """Return the "Next" pagination href found in ``doc``, or None."""
    match = re.search(FIRST_PATTERN, doc, re.M | re.I)
    return match.group(1) if match else None


def _download_urls(doc):
    """Return the distinct zip download URLs in ``doc``, in page order."""
    from urllib.parse import urljoin

    items = dict.fromkeys(re.findall(DOWNLOAD_LINK_PATTERN, doc))
    # urljoin avoids the "//" that a leading "/" in the href would produce.
    return [urljoin("https://github.com/", item + "zip") for item in items]


# Monitor the project's first listing page for updates.
def jiankong_page(interval=5 * 60 * 60, max_checks=None, base_url=BASE_URL,
                  output_path='result.txt'):
    """Poll the listing page and record download links that newly appear.

    The page counts as updated when its "Next" pagination link differs from
    the one seen previously. Every link on the updated page that has not been
    seen before is appended to ``output_path``.

    Args:
        interval: Seconds to sleep before each check (default: five hours).
        max_checks: Number of checks to perform; None polls until a request
            fails.
        base_url: URL of the page to monitor.
        output_path: File that new download URLs are appended to.

    Returns:
        The list of all download URLs seen so far.

    Raises:
        urllib.error.URLError: If the initial fetch fails. Failures during
            later checks are printed and end the loop instead.
    """
    print("star monitoring ")

    # Initial snapshot: pagination marker plus the links already published.
    doc = _fetch_page(base_url)
    flag_page = _next_page_link(doc)
    flag_list = _download_urls(doc)

    checks = 0
    while max_checks is None or checks < max_checks:
        checks += 1
        time.sleep(interval)
        try:
            doc = _fetch_page(base_url)
        except urllib.error.URLError as err:
            # HTTPError is a URLError subclass; ``reason`` exists on both.
            print(err.reason)
            break

        current_page = _next_page_link(doc)
        if current_page == flag_page:
            print("No update")
            continue

        print("have update")
        # The original called the non-existent re.rearch here and kept only
        # the first link; record every link that is not yet known instead.
        fresh = [url for url in _download_urls(doc) if url not in flag_list]
        if fresh:
            with open(output_path, 'a', encoding='utf-8') as out:
                for new_url in fresh:
                    print("new url = " + new_url)
                    flag_list.append(new_url)
                    out.write(new_url + '\n')
        flag_page = current_page

    return flag_list

if __name__ == "__main__":
    # Script entry point: start monitoring the first listing page.
    jiankong_page()

©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容