# offensive.py (crawls the project's historical release updates)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import time
import urllib.request
import conf as cf
# Releases listing of the exploitdb repository; default starting URL of MyCrawler.
BASE_URL = 'https://github.com/offensive-security/exploitdb/releases'
# Captures the href of a nofollow link ending in "zip"; group 1 stops just
# before the literal "zip", so callers append "zip" again.
DOWNLOAD_LINK_PATTERN = 'href="(.*?)zip" rel="nofollow">'
# Captures the href of the "Next" link when it directly follows a closing
# </span> (used when crawling the first page).
FIRST_PATTERN = r'</span><a rel="nofollow" href="(.*?)">Next.*'
# Captures the href of the "Next" link when it directly follows a
# "Previous" link (used on the pages after the first).
PAGE_PATTERN = r'>Previous</a><a rel="nofollow" href="(.*?)">Next.*'
class MyCrawler:
    """Walk the paginated releases listing and record archive links.

    Starting at ``base_url``, each page is downloaded, every link matching
    ``DOWNLOAD_LINK_PATTERN`` is appended to ``result.txt``, and the "Next"
    link is followed until a page has no such link or an HTTP error occurs.

    Attributes:
        base_url: URL of the page that will be fetched next.
        start_page: Label printed for the page being processed; after the
            first page it holds the URL of the current page.
    """

    def __init__(self, base_url=BASE_URL, start_page="first 1 page"):
        self.base_url = base_url
        self.start_page = start_page

    # Crawl the first page, then continue with the following pages.
    def first_page(self):
        """Process the first page and, if it links to a next page, the rest."""
        # Only continue when the first page was fetched and points somewhere;
        # retrying the same URL after an HTTP error would just fail again.
        if self._crawl_page(FIRST_PATTERN, 'Now working on page = {}\n'):
            self.fetch_next_page()

    # Pagination.
    def fetch_next_page(self):
        """Process pages starting at ``self.base_url`` until none is left."""
        while self._crawl_page(PAGE_PATTERN, 'Now working on page {}\n'):
            pass

    # Store the download links found at a URL in result.txt.
    def fetch_download_link(self, Aurl):
        """Download ``Aurl`` and append its archive links to ``result.txt``.

        Raises:
            urllib.error.HTTPError: if the server answers with an error status.
        """
        self._store_links(self._fetch(Aurl))

    def run(self):
        """Record the archive links of the page at ``self.base_url``."""
        # fetch_download_link requires the URL; calling it without one
        # raised TypeError.
        self.fetch_download_link(self.base_url)

    def _crawl_page(self, next_link_pattern, banner):
        """Process the page at ``self.base_url`` and advance to the next one.

        Args:
            next_link_pattern: Regex whose group 1 is the next page's URL.
            banner: Format string printed with ``self.start_page``.

        Returns:
            True when a next page was found and ``self.base_url`` now points
            to it; False on an HTTP error or when there is no next page.
        """
        try:
            doc = self._fetch(self.base_url)
        except urllib.error.HTTPError as err:
            print(err.msg)
            return False

        print(banner.format(self.start_page))
        # Wait 5 seconds between pages.
        time.sleep(5)
        # Reuse the document already downloaded instead of requesting the
        # same URL a second time just to extract the links.
        self._store_links(doc)

        next_page = re.search(next_link_pattern, doc, re.M | re.I)
        if next_page is None:
            # The last page has no "Next" link: stop cleanly.
            return False
        self.start_page = next_page.group(1)
        self.base_url = next_page.group(1)
        return True

    @staticmethod
    def _fetch(url):
        """Return the body of ``url`` decoded as UTF-8, ignoring bad bytes."""
        req = urllib.request.Request(url)
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf8', 'ignore')

    @staticmethod
    def _store_links(doc):
        """Append every distinct archive link found in ``doc`` to result.txt."""
        links = set(re.findall(DOWNLOAD_LINK_PATTERN, doc))
        # 'with' guarantees the file is closed if a write is interrupted.
        with open('result.txt', 'a') as out:
            for item in links:
                # The pattern's group stops before "zip", so add it back.
                url = "https://github.com/" + item + "zip"
                print('Storing {}'.format(url))
                out.write(url + '\n')
                time.sleep(7)
# Script entry point: start the crawl at the first releases page.
if __name__ == "__main__":
    MyCrawler().first_page()
# text.py (monitors the first page for updates and crawls them)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from selenium import webdriver
import re
import time
import urllib.request
import conf as cf
# Releases listing of the exploitdb repository; the page polled by the monitor.
BASE_URL = 'https://github.com/offensive-security/exploitdb/releases'
# Captures the href of a nofollow link ending in "zip"; group 1 stops just
# before the literal "zip", so callers append "zip" again.
DOWNLOAD_LINK_PATTERN = 'href="(.*?)zip" rel="nofollow">'
# Captures the href of the "Next" link when it directly follows a closing
# </span>; its value is compared between polls to detect a change.
FIRST_PATTERN = r'</span><a rel="nofollow" href="(.*?)">Next.*'
# 監(jiān)控項目首頁更新
def jiankong_page(interval_seconds=5 * 60 * 60):
    """Poll the first releases page and record a newly published archive.

    The href of the page's "Next" link is remembered. Every
    ``interval_seconds`` the page is fetched again; when that href differs
    from the remembered one, the first archive link on the page is appended
    to ``result.txt``. The loop ends when a poll fails with an HTTP error.

    Args:
        interval_seconds: Pause between two polls, in seconds (default 5 h).

    Raises:
        urllib.error.HTTPError: if the very first fetch fails.
    """

    def fetch_page():
        # The context manager closes the HTTP response once the body is read.
        req = urllib.request.Request(BASE_URL)
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf8', 'ignore')

    def next_link(doc):
        # re.search returns None when the page has no "Next" link; report
        # that as None instead of failing on .group().
        found = re.search(FIRST_PATTERN, doc, re.M | re.I)
        return found.group(1) if found else None

    print("star monitoring ")
    flag_page = next_link(fetch_page())

    # Periodic monitoring scan.
    while True:
        time.sleep(interval_seconds)
        try:
            doc = fetch_page()
        except urllib.error.HTTPError as err:
            print(err.msg)
            break

        # A changed pagination link is taken as the sign of an update.
        current_page = next_link(doc)
        if current_page == flag_page:
            print("No update")
            continue

        print("have update")
        flag_page = current_page
        # Take the first matching link on the page as the new archive.
        item = re.search(DOWNLOAD_LINK_PATTERN, doc, re.M | re.I)
        if item is None:
            # Nothing to record on this page; keep monitoring.
            continue
        new_url = "https://github.com/" + item.group(1) + "zip"
        print("new url = " + new_url)
        with open('result.txt', 'a') as f:
            f.write(new_url + '\n')
# Run the monitor only when this file is executed as a script.
if __name__ == "__main__":
    jiankong_page()