Machine learning notes (2): the Python pandas module

1. Reading data from a CSV file

import pandas as pd
Info = pd.read_csv('titanic_train.csv')
# print(type(Info))    # type of Info: <class 'pandas.core.frame.DataFrame'>
# Info.dtypes          # dtype of every column; string columns are reported as object
# help(type)           # help(...) shows how a function or type is used

Info.head(10)    # show the first 10 rows
Info.tail(2)     # show the last 2 rows

# Info.shape     # (number of rows, number of columns)

2. Selecting values from the loaded CSV data

Info['Name']                    # a single column
Info[['Name', 'Survived']]      # several columns at once
Info['Name'][[1, 2, 3]]         # Name values at index labels 1, 2 and 3

# .loc selects by index label; label slices include both end points
Info.loc[2]                     # the row labelled 2
Info.loc[2:6]                   # rows labelled 2 through 6
Info.loc[[2, 5, 7]]             # an explicit list of rows
Info.loc[88, 'Name']            # Name value of row 88
Info.loc[88:100, 'Name']        # Name values of rows 88 through 100
Info.loc[[88, 100], 'Name']     # Name values of rows 88 and 100 only
Info.loc[88:100, ['Name', 'Survived']]    # several columns for a range of rows


print(Info['Fare'].max())    # maximum of this column
print(Info['Fare'].min())    # minimum of this column (the original repeated max() here)


# dropna discards entries that contain NaN. axis=0 drops the affected rows,
# which is the usual choice; axis=1 drops the affected columns instead, which
# is rarely wanted because it removes an entire variable.
print(Info.dropna(axis=0))

Info.values     # the data as a numpy.ndarray
Info.columns    # the column labels


3. Sorting and index setting

# Optional parameters of sort_values:
#   ascending - True sorts in ascending order, False in descending order
#   axis      - 0 (the default) orders the rows; the first argument is then a column name
#               1 orders the columns; the first argument is then a row label
#   inplace   - whether to modify the original DataFrame instead of returning a new one
fare_sorted = Info.sort_values("Fare", ascending=True, axis=0)
fare_sorted.head(2)

# reset_index: with drop=False (the default) the old index is kept as an
# ordinary column; with drop=True the old index is discarded.
fare_sorted.head(10).reset_index(drop=True)
# Use the Fare column as the new index
fare_sorted.head(10).set_index("Fare")


Selecting the columns whose names end in d

# Select the columns whose names end in 'd'
cols = Info.columns.tolist()    # column labels as a plain list
P = [name for name in cols if name.endswith('d')]
Info[P]

Ticket prices after a 40% discount (each fare multiplied by 0.6)

#Info['Fare'] = Info['Fare']*0.6    # note: both sides of = must have the same dimension (number of rows)

Titanic death statistics

# Average age of the passengers who survived

InfoS = Info[Info['Survived'] == 1]
age_is_null = InfoS['Age'].isnull()       # True where the Age value is missing
Info_Age = InfoS['Age'][~age_is_null]     # only the Age values that are present
Info_Age.sum() / len(Info_Age)            # 28.343689655172415
# 28.343689655172415 as well: the same answer without filtering by hand,
# which shows that mean() skips missing values automatically
InfoS['Age'].mean()

# Average fare for each passenger class
Passenger_classes = [1, 2, 3]    # the three cabin classes
Info_P = Info['Pclass']
R = [Info['Fare'][Info_P == pclass].mean() for pclass in Passenger_classes]
R

Applying a custom function

# Custom function
def func(col, label=99):
    """Return the entry of ``col`` stored under index label ``label``.

    Intended for ``DataFrame.apply``, which calls it once per column.

    Args:
        col: A pandas Series (one column of the frame).
        label: Index label to look up; defaults to 99, the value that
            used to be hard-coded.

    Returns:
        The value of ``col`` at ``label``.

    Raises:
        KeyError: If ``label`` is not present in the index of ``col``.
    """
    return col.loc[label]
# Apply the custom function to every column of the frame
h = Info.apply(func)
Info.apply(func)
# Apply an existing function to every element of a column
import math
Info["Fare"].apply(math.sqrt)


Series

# A Series is a single row or a single column of a matrix
from pandas import Series

n = Info["Name"].shape[0]    # number of entries in the Name column

# Construct a Series from (values, index). The labels must be unique here:
# a repeated label would make the reindex call below fail, and the label
# 'b' has to exist for the list selection A[['a', 'b']] to succeed.
A = Series(Info["Name"].head(3).values, ['c', 'a', 'b'])

A['a']           # look up a single label
A[['a', 'b']]    # look up several labels at once

A.sort_values()    # sort by value, a-z ascending

Index = sorted(A.index.tolist())    # sort the index labels
A.reindex(Index)    # reorder the data to follow the new index order (each label keeps its value)

Posted by acook on Sun, 29 Dec 2019 06:35:14 -0800