python crawler: the first step of Ctrip integrated operation

Keywords: SQL Database JSON Windows

After the first two articles, we have almost everything prepared:
Crawling Ctrip ticket information: https://blog.csdn.net/weixin_42109012/article/details/96423081
Get Ctrip City abbreviation: https://blog.csdn.net/weixin_42109012/article/details/96378400
Now we need to replace the request payload we copied from the browser with a request payload that we build ourselves.

1. Request payload analysis

The flightWay field is not easy to change, so I didn't study it in depth.
airportParams is the key part of the flight request. We only want to enter the departure city, the destination and the date.
So we need to look up the abbreviation and number of each city from its name.

2. Extract information from database

This part is simple: run the query and the result comes back as a sequence of rows, where each row is a sequence of column values.

def select(sql, params=None):
    """Run a query against the local ``Ctrip`` MySQL database and return all rows.

    A new connection is opened for every call and closed again before the
    function returns.

    Args:
        sql: SQL statement to execute. It may contain ``%s`` placeholders
            when ``params`` is supplied.
        params: Optional sequence or mapping of values that the driver binds
            to the placeholders. Defaults to ``None``, in which case the
            statement is executed exactly as given, so existing
            ``select(sql)`` calls keep working.

    Returns:
        The result of ``cursor.fetchall()`` for the statement.

    Raises:
        Exception: Any error raised by pymysql is propagated unchanged; once
            the connection has been opened it is always closed first.
    """
    db = pymysql.connect(
        host="localhost",
        port=3308,
        user="root",
        password="123456",
        database="Ctrip"
    )
    try:
        # The cursor is a context manager, so it is closed even on error.
        with db.cursor() as cur:
            cur.execute(sql, params)
            return cur.fetchall()
    finally:
        db.close()

if __name__ == "__main__":
    sql = "select * from city where CN='Mianyang'"
    # Query once and reuse the rows: every select() call opens a new
    # database connection, so repeating the same query is wasted work.
    rows = select(sql)
    print(rows)        # all matching rows
    print(rows[0])     # first row
    print(rows[0][0])  # first column of the first row

3. Modify the request payload

# Ask the user for the route and the travel date.
dcityname = input("Please enter the departure city:")
acityname = input("Please enter a destination:")
date = input("Please enter the time (Format: 2008-08-08): ")
# NOTE: the city names are interpolated straight into the SQL text. That is
# unsafe for untrusted input; prefer a parameterized query where possible.
sql1 = "select * from city where CN='%s'" % dcityname
sql2 = "select * from city where CN='%s'" % acityname
# Fetch each city row once and read both needed columns from it, instead of
# running the same query twice per city.
drow = select(sql1)[0]
arow = select(sql2)[0]
# Columns 2 and 3 are used as the city code and the city id in the payload.
dcity, dcityid = drow[2], drow[3]
acity, acityid = arow[2], arow[3]
request_payload = {
    "flightWay": "Oneway",
    "classType": "ALL",
    "hasChild": False,
    "hasBaby": False,
    "searchIndex": 1,
    "airportParams": [
        # Example entry:
        # {"dcity": "SHA", "acity": "BJS", "dcityname": "Shanghai", "acityname": "Beijing", "date": "2019-07-23", "dcityid": 2, "acityid": 1}
        {"dcity": dcity, "acity": acity, "dcityname": dcityname, "acityname": acityname, "date": date, "dcityid": dcityid, "acityid": acityid}
    ]
}

Whole code

import requests
import json
import pymysql


def select(sql, params=None):
    """Run a query against the local ``Ctrip`` MySQL database and return all rows.

    A new connection is opened for every call and closed again before the
    function returns.

    Args:
        sql: SQL statement to execute. It may contain ``%s`` placeholders
            when ``params`` is supplied.
        params: Optional sequence or mapping of values that the driver binds
            to the placeholders. Defaults to ``None``, in which case the
            statement is executed exactly as given, so existing
            ``select(sql)`` calls keep working.

    Returns:
        The result of ``cursor.fetchall()`` for the statement.

    Raises:
        Exception: Any error raised by pymysql is propagated unchanged; once
            the connection has been opened it is always closed first.
    """
    db = pymysql.connect(
        host="localhost",
        port=3308,
        user="root",
        password="123456",
        database="Ctrip"
    )
    try:
        # The cursor is a context manager, so it is closed even on error.
        with db.cursor() as cur:
            cur.execute(sql, params)
            return cur.fetchall()
    finally:
        db.close()


def _lookup_city(name):
    """Return ``(code, city_id)`` for the city whose ``CN`` equals *name*.

    The name is passed as a bound parameter, so user input never becomes part
    of the SQL text. Returns ``None`` when no row matches.
    """
    rows = select("select * from city where CN=%s", (name,))
    if not rows:
        return None
    first = rows[0]
    # Columns 2 and 3 are used as the city code and the city id in the payload.
    return first[2], first[3]


def FlightInfo():
    """Prompt for a route and date, query Ctrip and print the direct flights.

    Reads the departure city, destination city and date from standard input,
    resolves both cities through the ``city`` table, posts a one-way search
    to the Ctrip products API and prints one line per single-leg route.
    Prints ``No flights!`` when the response carries no route list, and a
    ``City not found`` message when a city name is not in the database.
    """
    url = "https://flights.ctrip.com/itinerary/api/12808/products"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Referer": "https://flights.ctrip.com",
        "Content-Type": "application/json"
    }
    dcityname = input("Please enter the departure city:")
    acityname = input("Please enter a destination:")
    date = input("Please enter the time (Format: 2008-08-08): ")

    # One lookup per city; bail out early instead of failing on an empty result.
    departure = _lookup_city(dcityname)
    arrival = _lookup_city(acityname)
    if departure is None or arrival is None:
        print("City not found:", dcityname if departure is None else acityname)
        return
    dcity, dcityid = departure
    acity, acityid = arrival

    request_payload = {
        "flightWay": "Oneway",
        "classType": "ALL",
        "hasChild": False,
        "hasBaby": False,
        "searchIndex": 1,
        "airportParams": [
            # Example entry:
            # {"dcity": "SHA", "acity": "BJS", "dcityname": "Shanghai", "acityname": "Beijing", "date": "2019-07-23", "dcityid": 2, "acityid": 1}
            {"dcity": dcity, "acity": acity, "dcityname": dcityname, "acityname": acityname, "date": date, "dcityid": dcityid, "acityid": acityid}
        ]
    }

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.post(url, data=json.dumps(request_payload), headers=headers, timeout=10).text
    # "data" may be absent or null in the response, so fall back to an empty dict.
    routeList = (json.loads(response).get('data') or {}).get('routeList')
    if not routeList:
        print("No flights!")
        return

    for route in routeList:
        legs = route.get('legs') or []
        # Only direct flights: a route with exactly one leg.
        if len(legs) != 1:
            continue
        flight = legs[0].get('flight')
        # Extract the fields to display.
        airlineName = flight.get('airlineName')
        flightNumber = flight.get('flightNumber')
        departureDate = flight.get('departureDate')
        arrivalDate = flight.get('arrivalDate')
        departureInfo = flight.get('departureAirportInfo')
        arrivalInfo = flight.get('arrivalAirportInfo')
        departureCityName = departureInfo.get('cityName')
        departureAirportName = departureInfo.get('airportName')
        arrivalCityName = arrivalInfo.get('cityName')
        arrivalAirportName = arrivalInfo.get('airportName')

        print(departureCityName, departureAirportName, "\t",
              arrivalCityName, arrivalAirportName, "\t",
              departureDate, arrivalDate, "\t",
              airlineName, flightNumber)


# Run the interactive flight lookup only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    FlightInfo()

Effect

summary

I wonder what the real difference is between doing this and simply searching on the website: the result is just the same!
Finally, I think we could actually feed in the information in batches: iterate over the city numbers in the database (266 cities in total) with two nested for loops, skipping the pairs where both cities are the same. With several threads running (using several cores), all flight information for a given day could then be extracted by entering only the date.

date = input("Please enter the time (Format: 2008-08-08): ")
# Load every city row once up front. Each select() call opens its own
# database connection, so querying inside the nested loop would repeat the
# same lookups hundreds of thousands of times.
cities = [select("select * from city where id=%d" % i)[0] for i in range(1, 267)]
for i, drow in enumerate(cities):
    for j, arow in enumerate(cities):
        # Skip routes whose departure and arrival are the same city.
        if i == j:
            continue
        # Columns 1, 2 and 3 are used as the city name, code and id.
        dcityname, dcity, dcityid = drow[1], drow[2], drow[3]
        acityname, acity, acityid = arow[1], arow[2], arow[3]
        request_payload = {
            "flightWay": "Oneway",
            "classType": "ALL",
            "hasChild": False,
            "hasBaby": False,
            "searchIndex": 1,
            "airportParams": [
                # Example entry:
                # {"dcity": "SHA", "acity": "BJS", "dcityname": "Shanghai", "acityname": "Beijing", "date": "2019-07-23", "dcityid": 2, "acityid": 1}
                {"dcity": dcity, "acity": acity, "dcityname": dcityname, "acityname": acityname, "date": date,
                 "dcityid": dcityid, "acityid": acityid}
            ]
        }

Posted by hip_hop_x on Fri, 18 Oct 2019 07:58:04 -0700