Learning JavaScript anti-crawling: a Google mirror site

Keywords: Python

1. url: https://ac.scmor.com/

2. target: the following link

 

3. Process analysis:

3.1 Open the Chrome developer tools and inspect the page elements. Locate one of the "visit now" links.

 

3.2 The target URL does not appear directly in the page source; instead, the link calls a JavaScript function named "visit". The next step is to find this function.

3.3 search resources globally and find the visit function:

 

3.4 see that it also calls a strdecode function, and then go to:

 

3.5 see that it also calls a base64decode function, and then look for:

 

The variables that base64decode references (the base64DecodeChars lookup table) are also required.

3.6 next, put all the used js code into a js file, and refactor the js code a little.

js code is as follows:

// Key material found in the source of the web page.
// On the page the decoder is applied as: url = strdecode(url)
var Gword = "author: link@scmor.com."; // first part of the XOR key used by strdecode
var hn = "ac.scmor.com";               // second part of the XOR key used by strdecode

/**
 * Decode an obfuscated link parameter into the plain string it hides.
 *
 * The input is base64-decoded, each resulting character is XOR-ed with the
 * repeating key (Gword + hn), and the XOR output is base64-decoded again.
 *
 * @param {string} string - The encoded parameter.
 * @returns {string} The decoded string.
 */
function strdecode(string) {
    // Every working variable is declared locally. Assigning to undeclared
    // names would create implicit globals and throws in strict mode.
    var decoded = base64decode(string);
    var key = Gword + hn;
    var keyLen = key.length;
    var code = '';
    for (var i = 0; i < decoded.length; i++) {
        // The key repeats cyclically over the decoded text.
        code += String.fromCharCode(decoded.charCodeAt(i) ^ key.charCodeAt(i % keyLen));
    }
    return base64decode(code);
}

/**
 * Decode a base64 string into a string with one character per decoded byte.
 *
 * Characters that are not part of the base64 alphabet are skipped. A '='
 * in the third or fourth position of a group ends decoding and returns what
 * has been produced so far.
 *
 * @param {string} str - Base64 text.
 * @returns {string} The decoded characters (char codes 0..255).
 */
function base64decode(str) {
    var PAD = 61; // char code of '='
    var len = str.length;
    var i = 0;
    var out = "";
    var c1, c2, c3, c4;

    // Look up the 6-bit value for a masked char code. The table only covers
    // codes 0..127 while the mask allows 0..255, so a missing entry must be
    // reported as -1 ("not a base64 character") rather than undefined, which
    // would otherwise be decoded as value 0.
    function sextet(code) {
        var value = base64DecodeChars[code];
        return typeof value === "number" ? value : -1;
    }

    while (i < len) {
        // First sextet: skip anything outside the alphabet.
        do {
            c1 = sextet(str.charCodeAt(i++) & 0xff);
        } while (i < len && c1 === -1);
        if (c1 === -1) {
            break;
        }

        // Second sextet.
        do {
            c2 = sextet(str.charCodeAt(i++) & 0xff);
        } while (i < len && c2 === -1);
        if (c2 === -1) {
            break;
        }
        out += String.fromCharCode((c1 << 2) | ((c2 & 0x30) >> 4));

        // Third sextet, or padding.
        do {
            c3 = str.charCodeAt(i++) & 0xff;
            if (c3 === PAD) {
                return out;
            }
            c3 = sextet(c3);
        } while (i < len && c3 === -1);
        if (c3 === -1) {
            break;
        }
        out += String.fromCharCode(((c2 & 0XF) << 4) | ((c3 & 0x3C) >> 2));

        // Fourth sextet, or padding.
        do {
            c4 = str.charCodeAt(i++) & 0xff;
            if (c4 === PAD) {
                return out;
            }
            c4 = sextet(c4);
        } while (i < len && c4 === -1);
        if (c4 === -1) {
            break;
        }
        out += String.fromCharCode(((c3 & 0x03) << 6) | c4);
    }
    return out;
}

// Lookup table indexed by char code (0..127): the 6-bit base64 value of that
// character, or -1 when the character is not part of the base64 alphabet.
var base64DecodeChars = [
    // 0..15
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 16..31
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 32..47 ('+' at 43, '/' at 47)
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
    // 48..63 ('0'..'9' at 48..57)
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
    // 64..79 ('A'..'O' at 65..79)
    -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
    // 80..95 ('P'..'Z' at 80..90)
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    // 96..111 ('a'..'o' at 97..111)
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    // 112..127 ('p'..'z' at 112..122)
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1
];

There are two variables that need to be found in the source code of the web page, Gword and hn. After finding them, define the variables in the js code, as above.

3.7 Use execjs to run the JavaScript (saved as jsCode.js) from Python and obtain the "visit now" link. The complete code is as follows:

import execjs

def getJs(path='jsCode.js'):
    """Return the entire contents of a JavaScript source file as one string.

    Args:
        path: File to read. Defaults to 'jsCode.js' in the current working
            directory, the location this function has always read from.

    Returns:
        The file's text, unchanged.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, 'r') as f:
        # read() returns the whole file at once; no per-line accumulation needed.
        return f.read()


if __name__ == '__main__':
    # Compile the extracted JavaScript so its functions can be called from Python.
    js_context = execjs.compile(getJs())
    # Encoded argument to decode (presumably copied from a visit(...) call on the page).
    encoded_link = 'AD0mWAw6dxYgEFdYJEAAGCA2bFcLOngbAmYmFjRdS1ovGFBc'
    # Run the JavaScript strdecode on the encoded value and show the result.
    print(js_context.call('strdecode', encoded_link))

For someone who writes crawlers, my JavaScript is really weak.

Posted by tkreinbring on Sat, 21 Dec 2019 06:41:06 -0800