Offensive.py (crawls the project's release history)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import time
import urllib.request
import conf as cf
# Releases listing page that the crawl starts from.
BASE_URL = 'https://github.com/offensive-security/exploitdb/releases'
# Captures the href of a download anchor up to, but not including, the
# trailing "zip"; callers append "zip" again when rebuilding the URL.
DOWNLOAD_LINK_PATTERN = 'href="(.*?)zip" rel="nofollow">'
# Captures the href of the "Next" pagination anchor when it directly follows
# a closing </span> (presumably the disabled "Previous" control on the first
# page -- verify against the live markup).
FIRST_PATTERN = r'</span><a rel="nofollow" href="(.*?)">Next.*'
# Captures the href of the "Next" pagination anchor when it directly follows
# a "Previous" link.
PAGE_PATTERN = r'>Previous</a><a rel="nofollow" href="(.*?)">Next.*'
class MyCrawler:
    """Crawl a paginated releases listing and record its archive links.

    Starting from ``base_url``, each listing page is downloaded, every link
    matching ``DOWNLOAD_LINK_PATTERN`` is appended to ``result.txt``, and the
    "Next" pagination link is followed until no such link is found or an
    HTTP error occurs.
    """

    def __init__(self, base_url=BASE_URL, start_page="first 1 page"):
        """Store the crawl position.

        Args:
            base_url: URL of the listing page to crawl next.
            start_page: Label shown in the progress message for the current
                page; it is replaced by the "Next" link after each page.
        """
        self.base_url = base_url
        self.start_page = start_page

    def _download(self, url):
        """Return the body of ``url`` decoded as UTF-8, ignoring bad bytes.

        Raises:
            urllib.error.HTTPError: if the server answers with an error.
        """
        req = urllib.request.Request(url)
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf8', 'ignore')

    def _crawl_current_page(self, next_pattern, banner):
        """Process the page at ``self.base_url`` and advance to the next one.

        Args:
            next_pattern: Regex whose first group is the "Next" page URL.
            banner: Format string for the progress message; it receives
                ``self.start_page``.

        Returns:
            True if a "Next" link was found and the crawler now points at
            it; False if the crawl should stop (HTTP error or last page).
        """
        try:
            doc = self._download(self.base_url)
        except urllib.error.HTTPError as err:
            print(err.msg)
            return False

        print(banner.format(self.start_page))
        # Wait 5 seconds while turning pages.
        time.sleep(5)
        # Reuse the document that was just downloaded instead of fetching
        # the same page a second time.
        self.fetch_download_link(self.base_url, doc)

        next_page = re.search(next_pattern, doc, re.M | re.I)
        if next_page is None:
            # No "Next" link on this page: stop cleanly instead of failing
            # with AttributeError on ``None.group``.
            print('No further page found, stopping')
            return False
        self.start_page = next_page.group(1)
        self.base_url = next_page.group(1)
        return True

    # Crawling Home Page
    def first_page(self):
        """Crawl the home page, then keep following the "Next" links."""
        if self._crawl_current_page(FIRST_PATTERN,
                                    'Now working on page = {}\n'):
            self.fetch_next_page()

    # Page turning
    def fetch_next_page(self):
        """Crawl page after page until an error or the last page is hit."""
        while self._crawl_current_page(PAGE_PATTERN,
                                       'Now working on page {}\n'):
            pass

    # File Download: Save Download Links to Files
    def fetch_download_link(self, Aurl, doc=None):
        """Append every download link found on a page to ``result.txt``.

        Args:
            Aurl: URL of the listing page; downloaded only when ``doc`` is
                not supplied.
            doc: Optional, already-downloaded HTML of ``Aurl``.

        Raises:
            urllib.error.HTTPError: if ``doc`` is None and the download
                fails.
        """
        if doc is None:
            doc = self._download(Aurl)
        # dict.fromkeys removes duplicates while keeping the page order, so
        # the output is deterministic (a plain set is not).
        links = dict.fromkeys(re.findall(DOWNLOAD_LINK_PATTERN, doc))
        # Open the file only after the page is available, and let ``with``
        # close it even if a write fails.
        with open('result.txt', 'a') as f:
            for item in links:
                # Strip a leading "/" so a root-relative href does not yield
                # "https://github.com//...".
                url = "https://github.com/" + item.lstrip("/") + "zip"
                print('Storing {}'.format(url))
                f.write(url + '\n')
        # Pause once per processed page before the caller moves on.
        time.sleep(7)

    def run(self):
        """Record the download links of the page at ``self.base_url``."""
        # The original call passed no URL and raised TypeError.
        self.fetch_download_link(self.base_url)
# Script entry point: start crawling from the releases home page.
if __name__ == "__main__":
    crawler = MyCrawler()
    crawler.first_page()
Tex.py (monitors the project home page for updates and crawls them)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from selenium import webdriver
import re
import time
import urllib.request
import conf as cf
# Releases listing page that is polled for updates.
BASE_URL = 'https://github.com/offensive-security/exploitdb/releases'
# Captures the href of a download anchor up to, but not including, the
# trailing "zip"; callers append "zip" again when rebuilding the URL.
DOWNLOAD_LINK_PATTERN = 'href="(.*?)zip" rel="nofollow">'
# Captures the href of the "Next" pagination anchor when it directly follows
# a closing </span>; its value is used as a fingerprint of the home page.
FIRST_PATTERN = r'</span><a rel="nofollow" href="(.*?)">Next.*'
# Monitoring Project Home Page Update
def _read_releases_page():
    """Download ``BASE_URL`` and return it decoded as UTF-8 (bad bytes dropped)."""
    req = urllib.request.Request(BASE_URL)
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf8', 'ignore')


def _next_page_link(doc):
    """Return the "Next" pagination href found in ``doc``, or None."""
    match = re.search(FIRST_PATTERN, doc, re.M | re.I)
    return match.group(1) if match else None


# Monitoring Project Home Page Update
def jiankong_page():
    """Poll the releases home page every five hours and record new entries.

    The "Next" pagination link of the home page is used as a change marker:
    when it differs from the previously seen value, the first download link
    on the page is treated as the newly published item and appended to
    ``result.txt`` (unless that URL has already been seen).

    The loop runs until an HTTP error occurs during polling, which is
    printed before the function returns.

    Raises:
        urllib.error.HTTPError: if the very first download of the page
            fails (only errors inside the polling loop are handled).
    """
    print("star monitoring ")
    doc = _read_releases_page()
    # May be None when the page has no "Next" link; comparisons below still
    # work, whereas calling ``.group`` on a failed match would crash.
    flag_page = _next_page_link(doc)

    # Crawl the url of the homepage item for the first time
    flag_list = {
        "https://github.com/" + item.lstrip("/") + "zip"
        for item in re.findall(DOWNLOAD_LINK_PATTERN, doc)
    }

    # Timing Scanning Monitoring (5h/time)
    while True:
        try:
            time.sleep(5 * 60 * 60)
            doc = _read_releases_page()
        except urllib.error.HTTPError as err:
            print(err.msg)
            break

        # Determine whether pageflip links have changed to determine whether
        # they have been updated
        current_page = _next_page_link(doc)
        if current_page == flag_page:
            print("No update")
            continue

        print("have update")
        flag_page = current_page
        # Grab the first matching newly updated project url
        # (the original called the non-existent ``re.rearch`` here).
        item = re.search(DOWNLOAD_LINK_PATTERN, doc, re.M | re.I)
        if item is None:
            # Page changed but exposes no download link; nothing to record.
            continue
        # Strip a leading "/" so a root-relative href does not yield
        # "https://github.com//...".
        new_url = "https://github.com/" + item.group(1).lstrip("/") + "zip"
        print("new url = " + new_url)
        if new_url not in flag_list:
            # Only persist URLs that were not seen before, so result.txt
            # does not collect duplicates.
            flag_list.add(new_url)
            with open('result.txt', 'a') as f:
                f.write(new_url + '\n')
# Script entry point: start the monitoring loop.
if __name__ == "__main__":
    jiankong_page()
Let me introduce myself. I'm Fisher, an author who writes about Internet security. I share interesting security techniques and stories every day, and I also record what I learn along the way. If you are interested in the security field, you can follow my personal public account: austfish. If you don't want to lose track of it, please follow Fisher's Safety Diary (and don't forget to star it), or visit my personal blog: www.austfish.cn