Crawling an ACG Site with .NET Core (2)

Keywords: JSON Javascript Windows Attribute

This article follows up on the previous one. Although the full code has already been released, let's walk through the request and analysis process for another page.

PS: This gives me another chance to happily pad things out with a filler chapter, heh heh.

Page analysis

Last time I mentioned that the href attribute of the download button was `javascript:;`. That pseudo-protocol caused the link of the newly opened page to carry a `#` symbol, but we solved the problem of the first jump by using PhantomJS.


Download page

It turns out that this page is even more ruthless: there is not even a pseudo-protocol. Nevertheless, we follow the same method as last time, using PhantomJS to render the page and return the link of the page it jumps to in the response to the client's request.

Implementation

As mentioned in the previous section, run PhantomJS as a server and have the .NET code send requests to it, so that the crawled results are fed back to .NET. Note that the response returned to the client here can be either a web page or the actual data that PhantomJS parsed out of the HTML.

.NET Core code

 /// <summary>
        /// Resolves the download information for a details page by making two
        /// POST requests to the PhantomJS server: the first (to PostUrl1) asks for
        /// the download page URL, the second (to PostUrl2) asks for the Baidu Cloud
        /// links and share passwords found on that download page.
        /// </summary>
        /// <param name="url">URL of the details page that contains the download button.</param>
        /// <returns>
        /// The body of the second response (JSON data, per the original comments) when
        /// the first response contains "http"; otherwise the body of the first response,
        /// which is treated as an error message. Never null: a missing body becomes an
        /// empty string.
        /// </returns>
        public async Task<string> GetDownloadPageAsync(string url)
        {
            // One cookie container and one helper are shared by both requests.
            CookieContainer cc = new CookieContainer();
            HttpHelpers helper = new HttpHelpers();

            //Request phantomjs to get the download page
            string downloadPageUrl = await PostUrlAndDomAsync(
                helper, cc, this.PostUrl1, url, "Tappable-inactive animated fadeIn", 100000);
            Console.WriteLine($"first => { downloadPageUrl }");

            // Anything that does not look like a URL is passed back unchanged as the error message.
            if (!downloadPageUrl.Contains("http"))
            {
                return downloadPageUrl;
            }

            //Get Baidu Cloud Download Address and Share Password
            // NOTE(review): this timeout is ten times the first one (1000000 vs 100000) - confirm it is intentional.
            string result = await PostUrlAndDomAsync(
                helper, cc, this.PostUrl2, downloadPageUrl, "Tappable-inactive btn btn-success btn-block", 1000000);
            Console.WriteLine($"second => { result }");
            return result;
        }

        /// <summary>
        /// Posts a (target URL, button class name) pair, serialized as a JSON
        /// KeyValuePair, to <paramref name="postUrl"/> and returns the response body.
        /// </summary>
        /// <returns>The response body, or an empty string when no body is available.</returns>
        private static async Task<string> PostUrlAndDomAsync(
            HttpHelpers helper, CookieContainer cc, string postUrl, string targetUrl, string dom, int timeout)
        {
            HttpItems items = new HttpItems
            {
                Url = postUrl,
                Method = "POST",
                Container = cc,
                Postdata = JsonConvert.SerializeObject(new KeyValuePair<string, string>(targetUrl, dom)),
                Timeout = timeout
            };
            HttpResults hr = await helper.GetHtmlAsync(items);
            // Guard against a missing result/body so callers can safely call string methods on it.
            return hr?.Html ?? string.Empty;
        }

This covers both requests: the first one, which gets the download page from the details page, and the second one, which gets the Baidu Cloud links and share passwords from the download page.

JavaScript code

"use strict";
// NOTE: PhantomJS only understands ES5, so this script deliberately sticks to
// `var` and function expressions.

// TCP port the embedded web server listens on.
var port = 8089;
// Delay between the page finishing loading and the download buttons being clicked.
var CLICK_DELAY_MS = 6000;
// Delay between a request arriving and the collected results being sent back.
var RESPONSE_DELAY_MS = 20000;
var USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36";
var server = require('webserver').create();

/**
 * Request handler.
 *
 * Expects the request body to be a JSON string of the form
 *   {"Key":"https://acg12.com/download/#60e21d8417ab60fbfJfcqnT1BC8Qd20PehAIKv3J4ZO%2FJCo0htE9hP5IFZU",
 *    "Value":"Tappable-inactive btn btn-success btn-block"}
 * where Key is the download page URL returned by the first request and Value is
 * the class name of the download buttons on that page.
 *
 * Opens the URL, clicks every matching button, and answers with a JSON array of
 * {"url": ..., "password": ...} objects (status 200), or with '4XX' (status 400)
 * when the body is unusable or the page fails to load.
 */
var listening = server.listen(port, function (request, response) {
    var responded = false;
    var responseTimer = null;
    var page = null;
    var passwords = [];   // share passwords, in button (click) order
    var results = [];     // results[i] belongs to the i-th opened tab
    var childCount = 0;

    // Sends the response exactly once, cancels the pending timer and releases the page.
    function finish(statusCode, body) {
        if (responded) {
            return;
        }
        responded = true;
        if (responseTimer !== null) {
            window.clearTimeout(responseTimer);
        }
        // The status code has to be set before the first write, which sends the headers.
        response.statusCode = statusCode;
        response.write(body);
        response.close();
        if (page !== null) {
            // Deferred so the page is never closed from inside one of its own callbacks.
            window.setTimeout(function () {
                page.close();
            }, 0);
        }
    }

    response.headers = {
        'Cache-Control': 'no-cache',
        'Content-Type': 'text/plain',
        'Connection': 'Keep-Alive',
        'Keep-Alive': 'timeout=40, max=100'
    };

    // A malformed body must not take the whole server down; answer 400 instead.
    var data;
    try {
        data = JSON.parse(request.postRaw);
    } catch (e) {
        console.log('Invalid request body: ' + e);
        finish(400, '4XX');
        return;
    }
    if (!data || data.Key == null || data.Value == null) {
        finish(400, '4XX');
        return;
    }

    var url = data.Key.toString();
    console.log(url);
    var dom = data.Value.toString();
    console.log(dom);

    page = require('webpage').create();
    page.settings.loadImages = false;
    page.customHeaders = {
        "User-Agent": USER_AGENT,
        "Referer": url
    };
    // Clear the custom headers once the page is initialized.
    page.onInitialized = function () {
        page.customHeaders = {};
    };
    // Forward console output of the crawled page to the PhantomJS console.
    page.onConsoleMessage = function (msg, lineNum, sourceId) {
        console.log("$$$$$" + msg);
    };
    // Script errors inside the crawled page are logged only; exiting here would
    // let any broken third-party script kill the server for every client.
    page.onError = function (msg, trace) {
        var msgStack = ['PHANTOM ERROR: ' + msg];
        if (trace && trace.length) {
            msgStack.push('TRACE:');
            trace.forEach(function (t) {
                msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + (t.function ? ' (in function ' + t.function + ')' : ''));
            });
        }
        console.log(msgStack.join('\n'));
    };

    //According to the PhantomJS documentation, this callback triggers when a new tab is opened.
    page.onPageCreated = function (newPage) {
        // Tabs are created in click order, so this index pairs the tab with its password.
        var index = childCount;
        childCount += 1;

        newPage.onInitialized = function () {
            newPage.customHeaders = {};
        };
        newPage.settings.loadImages = false;
        newPage.customHeaders = {
            "User-Agent": USER_AGENT
        };
        //Triggered when the Baidu Cloud page has been opened and loaded
        newPage.onLoadFinished = function (status) {
            if (index >= passwords.length) {
                return; // passwords not collected yet, or no password slot for this tab
            }
            // A later load of the same tab overwrites the earlier entry.
            var entry = {"url": newPage.url.toString(), "password": String(passwords[index])};
            console.log(JSON.stringify(entry));
            results[index] = entry;
        };
    };

    page.open(url, function (status) {
        console.log("----" + status);
        if (responded) {
            return;
        }
        if (status !== 'success') {
            finish(400, '4XX');
            return;
        }
        window.setTimeout(function () {
            if (responded) {
                return; // the response went out (and the page was closed) before the delay elapsed
            }
            // Runs inside the page: collect the passwords, then click every button.
            passwords = page.evaluate(function (dom) {
                console.log(dom);
                var found = [];
                var buttons = document.getElementsByClassName(dom); // Baidu Cloud links
                var i;
                for (i = 0; i < buttons.length; i += 1) {
                    //Assumes the i-th download button's password lives in #downloadPwd-i
                    var field = document.getElementById("downloadPwd-" + i);
                    // "null" keeps the list aligned one-to-one with the buttons.
                    found.push(field !== null ? field.value : "null");
                }
                for (i = 0; i < buttons.length; i += 1) {
                    buttons[i].click(); // opens the new tab
                }
                return found;
            }, dom) || [];
        }, CLICK_DELAY_MS);
    });

    //Wait before answering so that the opened tabs have time to finish loading.
    responseTimer = window.setTimeout(function () {
        // Drop the slots of tabs that never finished loading.
        var collected = results.filter(function (entry) {
            return entry !== undefined;
        });
        var rs = JSON.stringify(collected);
        console.log(rs);
        finish(200, rs);
    }, RESPONSE_DELAY_MS);
});

if (!listening) {
    console.log('Could not start web server on port ' + port);
    phantom.exit(1);
}
// Global PhantomJS error hook: prints the message plus one line per stack
// frame (when a trace is supplied), then terminates with exit status 1.
phantom.onError = function (msg, trace) {
    var lines = ['PHANTOM ERROR: ' + msg];
    var i;
    var frame;
    var suffix;
    if (trace && trace.length) {
        lines.push('TRACE:');
        for (i = 0; i < trace.length; i += 1) {
            frame = trace[i];
            suffix = frame.function ? ' (in function ' + frame.function + ')' : '';
            lines.push(' -> ' + (frame.file || frame.sourceURL) + ': ' + frame.line + suffix);
        }
    }
    console.log(lines.join('\n'));
    phantom.exit(1);
};

The complete source code has been placed on GitHub. There is a .bat file in it; just run run.bat directly, provided, of course, that the environment from the first section has been configured. See you next week. Next week I may try DotnetSpider, a .NET Core crawler framework written with reference to WebMagic; if you are interested, you can try playing with it first.
Thank you.

Posted by snuggles79 on Fri, 24 May 2019 16:34:15 -0700