Website: http://glidedsky.com
Topic 1: Calculating the sum of all the numbers on the web page
Register and log in to have a look. Open the page to be crawled and you'll find it's nothing but numbers. The first challenge really is simple; there's not much to say.
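For reference, the whole thing fits in a few lines. Here is a minimal sketch, assuming you copy your logged-in `glidedsky_session` cookie from the browser (the same logic appears as `basic_one` in the full script under Topic 2):

```python
import requests
from lxml import etree

# Minimal sketch of challenge 1: grab the page with a logged-in cookie and
# sum every number sitting in the col-md-1 cells.
cookies = {"glidedsky_session": "..."}  # placeholder: copy from your browser
res = requests.get("http://glidedsky.com/level/web/crawler-basic-1", cookies=cookies)
html = etree.HTML(res.text)
print(sum(int(n.strip()) for n in html.xpath('//div[@class="col-md-1"]/text()')))
```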
Topic 2: The same sum, but over 1,000 requests
This challenge is the same as the first, just spread across 1,000 pages. The simplest approach is to tweak the code above to loop over every page, but that is far too slow. You can optimize it yourself: adding threads or going straight to coroutines both work well, and either should be noticeably faster, though I didn't run rigorous benchmarks.

Running the straightforward loop version, with no threads or coroutines, takes quite a while. It gets the job done, but the runtime depends entirely on how you optimize. There is certainly still room for improvement; being lazy, I settled for the coroutine version.
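One note before the script: it leans on a `tools.headers_to_dict` helper that isn't shown anywhere in the post. Presumably it splits a raw header block, copied straight from the browser's DevTools, into a dict; a minimal sketch under that assumption:

```python
# Hypothetical reconstruction of tools.headers_to_dict: parse a raw
# "Name: value" header block (one header per line) into a dict.
def headers_to_dict(raw_headers: str) -> dict:
    headers = {}
    for line in raw_headers.strip().splitlines():
        line = line.strip()
        if not line or ':' not in line:
            continue  # skip blank or malformed lines
        name, _, value = line.partition(':')
        headers[name.strip()] = value.strip()
    return headers
```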
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time     : 2019/8/18 0:33
# @Author   : zhao.jia
# @Site     :
# @File     : glide_test.py
# @Software : PyCharm

import asyncio
import datetime
import time

import aiohttp
import requests
from lxml import etree
from requests.adapters import HTTPAdapter

import tools


class TestGlidedsky:
    def __init__(self):
        # Raw header block copied from the browser; converted to a dict below
        self.headers = """
        Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
        Accept-Encoding: gzip, deflate
        Accept-Language: zh-CN,zh;q=0.9
        Cache-Control: max-age=0
        Connection: keep-alive
        Cookie: _ga=GA1.2.1425271689.1566058842; _gid=GA1.2.586445152.1566058842; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566058842,1566106841; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566129989; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IjM4SmpWMlwvaWxPQklreFVaMDFXVFhRPT0iLCJ2YWx1ZSI6IjdoMUFJaVF6YUVvUUNDZU1TaERsN0FVK0dRdTdORW9QUlwvNDlMXC9uXC9IdjdCZ2JCQVhiMXNEV2JKQnI5UXVIMHAiLCJtYWMiOiIyMWMyYzc1MzM3MWQyZTMxNDQwZjA5ZTUxNDZkOThmNTAyOWQwYTQzZDQyZTc4M2Q4YjNlZTI3YjYzZjgwNzA1In0%3D; glidedsky_session=eyJpdiI6Ik1rRUMrXC8yMlVkOEZlSEZja24zdmJRPT0iLCJ2YWx1ZSI6IjRoWG84K1MrM3NLbnlRVytrUVRHd1ZqWWtkdkdyeUtwOTBKdDFWTnl4THdkS1hcL2dmRzA1c1JJRDZSaHk2NlhKIiwibWFjIjoiNmQ2MmJhNWFlNzZiOWEwY2NiMDM1ZTBkZGE2MmNiNGQwNWU4OGJmOTU2OWQxNmU2NmM1MjE1ZmI0NGQ3MjllNyJ9
        Host: glidedsky.com
        Referer: http://glidedsky.com/login
        Upgrade-Insecure-Requests: 1
        User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36
        """
        self.sess = requests.session()
        self.sess.headers = tools.headers_to_dict(self.headers)
        self.sum_count_2 = 0
        # Retry failed connections up to three times
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False

    # First challenge: a single page of numbers
    def basic_one(self):
        sum_count = 0
        res = self.sess.get(url="http://glidedsky.com/level/web/crawler-basic-1")
        res_html = etree.HTML(res.text)
        nums = res_html.xpath('//div[@class="col-md-1"]/text()')
        for num in nums:
            sum_count += int(num.strip())
        print(f"sum={sum_count}")  # f-string: "sum=" + sum_count would raise TypeError

    # Second challenge: the same numbers spread over 1000 pages, plain loop
    def basic_two(self):
        count = 1
        sum_count = 0
        while True:
            res = self.sess.get(f"http://glidedsky.com/level/web/crawler-basic-2?page={count}")
            res_html = etree.HTML(res.text)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            for num in nums:
                sum_count += int(num.strip())
            count += 1
            if count == 1001:
                break
        print(sum_count)

    # Coroutine version: fetch and tally a single page
    async def basic_two_2(self, url):
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=tools.headers_to_dict(self.headers)) as resp:
                res = await resp.text()
                res_html = etree.HTML(res)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                for num in nums:
                    self.sum_count_2 += int(num.strip())

    # Run the 1000 pages in two batches of roughly 500 concurrent tasks
    def sum_async_count(self):
        loop = asyncio.get_event_loop()
        tasks = [asyncio.ensure_future(
            self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}"))
            for i in range(1, 500)]
        loop.run_until_complete(asyncio.gather(*tasks))
        tasks = [asyncio.ensure_future(
            self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}"))
            for i in range(500, 1001)]
        loop.run_until_complete(asyncio.gather(*tasks))
        print(self.sum_count_2)


if __name__ == '__main__':
    # Second challenge, plain loop version
    # starttime = datetime.datetime.now()
    # TestGlidedsky().basic_two()
    # endtime = datetime.datetime.now()
    # count_time_1 = (endtime - starttime).seconds
    # print(count_time_1)

    # Second challenge, coroutine version
    # starttime_2 = datetime.datetime.now()
    # TestGlidedsky().sum_async_count()
    # endtime_2 = datetime.datetime.now()
    # count_time_2 = (endtime_2 - starttime_2).seconds
    # print(count_time_2)
```
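The script above takes the coroutine route. For the thread option mentioned earlier, a sketch with `concurrent.futures` might look like the following; the cookie value is a placeholder, and each worker makes its own `requests.get` call to stay thread-safe:

```python
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree

# Hypothetical threaded take on challenge 2 (cookie value is a placeholder).
COOKIES = {"glidedsky_session": "..."}  # copy your own login cookie here

def page_sum(page: int) -> int:
    # Fetch one page and return the sum of its numbers.
    res = requests.get(
        f"http://glidedsky.com/level/web/crawler-basic-2?page={page}",
        cookies=COOKIES, timeout=10)
    html = etree.HTML(res.text)
    return sum(int(n.strip()) for n in html.xpath('//div[@class="col-md-1"]/text()'))

# A modest pool keeps the server happy; sum the per-page totals as they finish.
with ThreadPoolExecutor(max_workers=20) as pool:
    print(sum(pool.map(page_sum, range(1, 1001))))
```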
Topic 3: Still summing
This time, though, IPs get banned: each IP can only visit once. This challenge is rather annoying; the only way through is proxy IPs. Find some, free ones if you can, and keep retrying.
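The script below authenticates against the Abuyun tunnel by hand: it base64-encodes `user:pass` and sends it as a `Proxy-Authorization` header. For what it's worth, requests can also take the credentials inline in the proxy URL, which does the same job with less ceremony; a sketch using the same masked account:

```python
import requests

# Same tunnel auth as the script below, but letting requests build the
# Proxy-Authorization header from credentials embedded in the proxy URL.
proxy_user = "H2T*****22WD"  # masked, as in the original post
proxy_pass = "7****10526D3F"
proxies = {"http": f"http://{proxy_user}:{proxy_pass}@http-dyn.abuyun.com:9020"}
res = requests.get("http://glidedsky.com/level/web/crawler-ip-block-1?page=1",
                   proxies=proxies, timeout=10)
print(res.status_code)
```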
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time     : 2019/8/27 11:00
# @Author   : Andrew
# @Site     :
# @File     : python-abu.py
# @Software : PyCharm

import base64
import time
from urllib import request

import requests
from lxml import etree
from requests.adapters import HTTPAdapter

import tools


class test:
    def __init__(self):
        self.sess = requests.session()
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False

    def abu_test(self):
        # Proxy server (Abuyun dynamic tunnel)
        proxyHost = "proxy.abuyun.com"
        proxyPort = "9020"
        # Proxy tunnel credentials (masked)
        proxyUser = "H2T*****22WD"
        proxyPass = "7****10526D3F"
        proxy_dict = {'http': "http-dyn.abuyun.com:9020"}
        # Build the Proxy-Authorization header by hand: base64("user:pass")
        auth = f"{proxyUser}:{proxyPass}"
        auth = base64.b64encode(auth.encode('utf8'))
        proxy_header = {"Proxy-Authorization": 'Basic ' + auth.decode()}
        self.get_html(proxy_dict, proxy_header)

    def get_html(self, proxy_dict, proxy_header):
        count = 1
        sum_count = 0
        headers = """
        Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
        Accept-Encoding: gzip, deflate
        Accept-Language: zh-CN,zh;q=0.9
        Cache-Control: max-age=0
        Cookie: _ga=GA1.2.1251062763.1566609395; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566609396,1566627265; _gid=GA1.2.1809641921.1566875827; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IkNpMHk0SHlDSXIrWHU4MTBIaW96blE9PSIsInZhbHVlIjoiMXpzXC9GRmZGekxQYW5wcUt0ZU0xQ0l0MWVnNHdKWHo5XC9JNTRnZ0c0UWJlYjZlaDVhU1BNRGxENGNoWjBpdkE0IiwibWFjIjoiYTVjYmJjMzY3OTNiNTJjMDE5MjZhNmEzNDIwNGFmZDYwYzk5Yjg5ZjViYmExMzQwMjVkMTkzNDcyMmJjZmYxMyJ9; glidedsky_session=eyJpdiI6ImJ4aHA3QllGZE9PTlRnbTByZnNNOFE9PSIsInZhbHVlIjoiMGt6bUdqbDBcL2JSRERXbVFyMEdHNDArZmtOTHdQOFRidVlRUTFvMXRWajAzNUlja3gyN3JmV1U1QkVHUHBVU3UiLCJtYWMiOiI0OTY1ZGZmZDgwMTU4YTliNjM0NWVhZTU5MzRhNGQwYmMwM2YzNDc2ZGRkZjVmZDg0ZjQwMGUwODkyNjUwMmY3In0%3D; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566875832
        Host: glidedsky.com
        Proxy-Connection: keep-alive
        Referer: http://glidedsky.com/login
        Upgrade-Insecure-Requests: 1
        User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36
        """
        headers = tools.headers_to_dict(headers)
        headers.update(proxy_header)
        while True:
            # if count == 37 or count == 38:
            #     continue
            try:
                res = self.sess.get(
                    f"http://glidedsky.com/level/web/crawler-ip-block-1?page={count}",
                    headers=headers, proxies=proxy_dict, timeout=10)
            except Exception as e:
                print("request failed")
                print(e)
                continue  # retry the same page on a new tunnel IP
            file_name = f'glidedsky_{count}.html'
            if res.status_code == 200:
                # Save every page locally so it can be re-parsed offline
                with open(file_name, 'w', encoding='utf8') as f:
                    f.write(res.text)
                res_html = etree.HTML(res.text)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                if nums:
                    print("found")
                    for num in nums:
                        sum_count += int(num.strip())
                    count += 1
                    print(sum_count)
                    if count == 1001:
                        return sum_count
                # An empty 200 response means the IP was blocked; loop retries the page
            # time.sleep(3)

    def parse_html(self):
        # Re-parse the saved pages offline and total them up
        count = 1
        sum_count = 0
        while True:
            file_name = f'glidedsky_{count}.html'
            with open(file_name, 'r', encoding='utf8') as f:
                content = f.read()
            res_html = etree.HTML(content)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            if nums:
                for num in nums:
                    sum_count += int(num.strip())
                print("page", count, "running total", sum_count)
            else:
                # An empty file means that page was blocked and needs re-crawling
                print("No content", file_name)
            # Advance even on an empty file so the loop cannot stall,
            # and stop at page 1000, the last file get_html ever saves
            if count == 1000:
                break
            count += 1
        print("The sum", sum_count)


if __name__ == '__main__':
    # test().abu_test()
    test().parse_html()
```
Result: (screenshot of the run output omitted)