Crawler Pass: GlidedSky Foundation Question

Keywords: Python encoding xml Session

Website: http://glidedsky.com

Topic 1: Calculating the sum of all the numbers on the web page

Register and log in to see the challenges.

Open the target website to be crawled and you'll find it is just a page full of numbers; the first challenge is really simple, so there's not much to say about it.

Question 2: Same question, 1000 requests

This challenge is the same idea. The simplest approach is to tweak the code written above, but that is too slow; you can try to optimize it yourself — adding threads or using coroutines directly both work well. Either should be quicker, though I did no specific benchmarks.

Running results: this is the direct modification, without adding any threads or coroutines, so it takes a fairly long time. It is very basic — how fast it goes depends on how you optimize it.

It could be optimized further; being lazy about writing code, I rewrote it using coroutines.

#!/usr/bin/env python 
# -*- coding: utf-8 -*- 
# @Time : 2019/8/18 0:33 
# @Author : zhao.jia
# @Site :  
# @File : glide_test.py 
# @Software: PyCharm

import requests
import tools
from lxml import etree
import aiohttp
import asyncio
import datetime
import time
from requests.adapters import HTTPAdapter


class TestGlidedsky:
    """Solutions for the GlidedSky crawler-basic challenges 1 and 2.

    Challenge 1 sums all numbers on a single page; challenge 2 sums the
    numbers spread across 1000 pages, with both a plain synchronous
    version (basic_two) and an aiohttp/asyncio coroutine version
    (basic_two_2 / sum_async_count).
    """

    def __init__(self):

        # Raw request headers copied from the browser's DevTools.  The
        # Cookie line carries the logged-in session, so these values must
        # stay exactly as captured; tools.headers_to_dict parses the raw
        # block into a dict below.
        self.headers = """
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
            Accept-Encoding: gzip, deflate
            Accept-Language: zh-CN,zh;q=0.9
            Cache-Control: max-age=0
            Connection: keep-alive
            Cookie: _ga=GA1.2.1425271689.1566058842; _gid=GA1.2.586445152.1566058842; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566058842,1566106841; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566129989; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IjM4SmpWMlwvaWxPQklreFVaMDFXVFhRPT0iLCJ2YWx1ZSI6IjdoMUFJaVF6YUVvUUNDZU1TaERsN0FVK0dRdTdORW9QUlwvNDlMXC9uXC9IdjdCZ2JCQVhiMXNEV2JKQnI5UXVIMHAiLCJtYWMiOiIyMWMyYzc1MzM3MWQyZTMxNDQwZjA5ZTUxNDZkOThmNTAyOWQwYTQzZDQyZTc4M2Q4YjNlZTI3YjYzZjgwNzA1In0%3D; glidedsky_session=eyJpdiI6Ik1rRUMrXC8yMlVkOEZlSEZja24zdmJRPT0iLCJ2YWx1ZSI6IjRoWG84K1MrM3NLbnlRVytrUVRHd1ZqWWtkdkdyeUtwOTBKdDFWTnl4THdkS1hcL2dmRzA1c1JJRDZSaHk2NlhKIiwibWFjIjoiNmQ2MmJhNWFlNzZiOWEwY2NiMDM1ZTBkZGE2MmNiNGQwNWU4OGJmOTU2OWQxNmU2NmM1MjE1ZmI0NGQ3MjllNyJ9
            Host: glidedsky.com
            Referer: http://glidedsky.com/login
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36
        """
        self.sess = requests.session()
        self.sess.headers = tools.headers_to_dict(self.headers)
        # Shared accumulator written by the coroutine version (basic_two_2).
        self.sum_count_2 = 0
        # Retry transient connection failures up to 3 times per scheme.
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        # NOTE(review): TLS certificate verification is disabled session-wide.
        self.sess.verify = False


    def basic_one(self):
        """Challenge 1: fetch the single page and print the sum of its numbers."""
        sum_count = 0
        res = self.sess.get(url="http://glidedsky.com/level/web/crawler-basic-1")
        res_html = etree.HTML(res.text)
        nums = res_html.xpath('//div[@class="col-md-1"]/text()')
        for num in nums:
            sum_count += int(num.strip())
        # Fixed: the original `"sum=" + sum_count` concatenated str + int,
        # which raises TypeError at runtime.
        print(f"sum={sum_count}")

    # Second questions
    def basic_two(self):
        """Challenge 2, synchronous version: sum the numbers on pages 1-1000."""
        count = 1
        sum_count = 0
        while True:
            res = self.sess.get(f"http://glidedsky.com/level/web/crawler-basic-2?page={count}")
            res_html = etree.HTML(res.text)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            for num in nums:
                sum_count += int(num.strip())
            count += 1
            # Stop after page 1000 has been processed.
            if count == 1001:
                break
        print(sum_count)

    async def basic_two_2(self, url):
        """Fetch one challenge page and add its numbers to self.sum_count_2."""
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=tools.headers_to_dict(self.headers)) as resp:
                res = await resp.text()
                res_html = etree.HTML(res)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                for num in nums:
                    self.sum_count_2 += int(num.strip())

    def sum_async_count(self):
        """Challenge 2, coroutine version.

        Runs pages 1-499 and 500-1000 as two gathered batches to avoid
        opening 1000 connections at once, then prints the total.
        """
        loop = asyncio.get_event_loop()
        tasks = [asyncio.ensure_future(
            self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}")) for i in
                 range(1, 500)]
        loop.run_until_complete(asyncio.gather(*tasks))
        tasks = [asyncio.ensure_future(
            self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}")) for i in
            range(500, 1001)]
        loop.run_until_complete(asyncio.gather(*tasks))
        print(self.sum_count_2)


if __name__ == '__main__':
    # Second question, synchronous version — uncomment to time basic_two():
    # starttime = datetime.datetime.now()
    # TestGlidedsky().basic_two()
    # endtime = datetime.datetime.now()
    # count_time_1 = (endtime - starttime).seconds
    # print(count_time_1)
    # Second question, coroutine version — uncomment to time sum_async_count():
    # starttime_2 = datetime.datetime.now()
    # TestGlidedsky().sum_async_count()
    # endtime_2 = datetime.datetime.now()
    # count_time_2 = (endtime_2 - starttime_2).seconds
    # print(count_time_2)
    # Fixed: the original guard contained only comments, which is a
    # SyntaxError (an `if` needs at least one statement in its body).
    pass

Question 3: The same sum, yet again.

But this time IPs get banned: each IP may only visit once. This challenge is a bit of a pain — the only way through is proxy IPs. I looked for free ones first, then tried again.

#!/usr/bin/env python 
# -*- coding: utf-8 -*- 
# @Time : 2019/8/27 11:00 
# @Author : Andrew
# @Site :  
# @File : python-abu.py 
# @Software: PyCharm

#! -*- encoding:utf-8 -*-

from urllib import request
import base64
from lxml import etree
import time
import requests
from requests.adapters import HTTPAdapter


class test:
    """Solution for the GlidedSky IP-block challenge using the Abuyun proxy.

    get_html downloads pages 1-1000 through the rotating proxy and saves
    each page to disk; parse_html re-reads the saved files and sums the
    numbers offline, so a failed crawl never forces re-downloading.
    """

    def __init__(self):
        self.sess = requests.session()
        # Retry transient connection failures up to 3 times per scheme.
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        # NOTE(review): TLS certificate verification is disabled session-wide.
        self.sess.verify = False

    def abu_test(self):
        """Build the Abuyun dynamic-proxy settings and start the crawl."""

        # proxy server (kept for reference; the tunnel URL below hard-codes it)
        proxyHost = "proxy.abuyun.com"
        proxyPort = "9020"

        # Proxy Tunnel Verification Information (credentials redacted here)
        proxyUser = "H2T*****22WD"
        proxyPass = "7****10526D3F"
        proxy_dict = {'http': "http-dyn.abuyun.com:9020"}
        # Proxy basic auth: base64("user:pass") in a Proxy-Authorization header.
        auth = f"{proxyUser}:{proxyPass}"
        auth = base64.b64encode(auth.encode('utf8'))
        proxy_header = {"Proxy-Authorization": 'Basic ' + auth.decode()}
        self.get_html(proxy_dict, proxy_header)

    def get_html(self, proxy_dict, proxy_header):
        """Crawl pages 1-1000 through the proxy, saving each page to disk.

        A page is retried (same ``count``) whenever the request fails, the
        status is not 200, or the page contains no numbers — the rotating
        proxy may serve a blocked response that a later attempt gets past.
        Returns the running sum once page 1000 has been processed.
        """
        count = 1
        sum_count = 0

        # Raw browser headers; the Cookie carries the logged-in session.
        headers = """
                Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
                Accept-Encoding: gzip, deflate
                Accept-Language: zh-CN,zh;q=0.9
                Cache-Control: max-age=0
                Cookie: _ga=GA1.2.1251062763.1566609395; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566609396,1566627265; _gid=GA1.2.1809641921.1566875827; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IkNpMHk0SHlDSXIrWHU4MTBIaW96blE9PSIsInZhbHVlIjoiMXpzXC9GRmZGekxQYW5wcUt0ZU0xQ0l0MWVnNHdKWHo5XC9JNTRnZ0c0UWJlYjZlaDVhU1BNRGxENGNoWjBpdkE0IiwibWFjIjoiYTVjYmJjMzY3OTNiNTJjMDE5MjZhNmEzNDIwNGFmZDYwYzk5Yjg5ZjViYmExMzQwMjVkMTkzNDcyMmJjZmYxMyJ9; glidedsky_session=eyJpdiI6ImJ4aHA3QllGZE9PTlRnbTByZnNNOFE9PSIsInZhbHVlIjoiMGt6bUdqbDBcL2JSRERXbVFyMEdHNDArZmtOTHdQOFRidVlRUTFvMXRWajAzNUlja3gyN3JmV1U1QkVHUHBVU3UiLCJtYWMiOiI0OTY1ZGZmZDgwMTU4YTliNjM0NWVhZTU5MzRhNGQwYmMwM2YzNDc2ZGRkZjVmZDg0ZjQwMGUwODkyNjUwMmY3In0%3D; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566875832
                Host: glidedsky.com
                Proxy-Connection: keep-alive
                Referer: http://glidedsky.com/login
                Upgrade-Insecure-Requests: 1
                User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36
                """
        import tools
        headers = tools.headers_to_dict(headers)
        headers.update(proxy_header)
        while True:
            try:
                res = self.sess.get(f"http://glidedsky.com/level/web/crawler-ip-block-1?page={count}", headers=headers,
                                   proxies=proxy_dict, timeout=10)
            except Exception as e:
                # Network/proxy error: log and retry the same page.
                print("abnormal")
                print(e)
                continue
            file_name = f'glidedsky_{count}.html'
            if res.status_code == 200:
                # Save every 200 response so parsing can be redone offline.
                with open(file_name, 'w', encoding='utf8') as f:
                    f.write(res.text)
                res_html = etree.HTML(res.text)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                if nums:
                    print("zhaodao")
                    for num in nums:
                        sum_count += int(num.strip())
                    count += 1
                    print(sum_count)
                    if count == 1001:
                        return sum_count

    def parse_html(self, total_pages=1000):
        """Sum the numbers in the locally saved pages glidedsky_1..N.html.

        ``total_pages`` generalizes the previously hard-coded 1000 pages.
        Fixes two defects in the original: it tried to open page 1001
        (which was never saved) before checking its stop condition, and a
        saved file with no numbers caused an infinite loop because
        ``count`` was never advanced before retrying the same file.
        """
        sum_count = 0
        for count in range(1, total_pages + 1):
            file_name = f'glidedsky_{count}.html'
            with open(file_name, 'r', encoding='utf8') as f:
                content = f.read()
            res_html = etree.HTML(content)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            if nums:
                for num in nums:
                    sum_count += int(num.strip())
                print("Frequency synthesis", count, sum_count)
            else:
                # A static file will never change; log and move on instead
                # of retrying it forever.
                print("No content", file_name)
        print("The sum", sum_count)


if __name__ == '__main__':
    # test().abu_test()  # proxy-based crawl; run first to save the pages
    crawler = test()
    crawler.parse_html()

Result:

This article was published automatically via ArtiPub, a multi-platform article publishing tool.

Posted by tronicsmasta on Mon, 30 Sep 2019 03:19:17 -0700