1. Reading CSV file data
import pandas as pd

# Load the Titanic training set into a DataFrame.
Info = pd.read_csv('titanic_train.csv')

# print(type(Info))  # <class 'pandas.core.frame.DataFrame'>
# Info.dtypes        # dtype of every column (string columns are reported as object)
# help(type)         # show how a function is used
Info.head(10)  # first 10 rows
Info.tail(2)   # last 2 rows
# Info.shape   # (number of rows, number of columns)
2. Selecting values from the CSV data
# --- Column selection ---
Info['Name']                 # a single column
Info[['Name', 'Survived']]   # several columns at once
Info.loc[[1, 2, 3], 'Name']  # "Name" for the rows labelled 1, 2 and 3

# --- Row selection by label ---
Info.loc[2]          # the row labelled 2
Info.loc[2:6]        # rows labelled 2 through 6 (.loc slices include both ends)
Info.loc[[2, 5, 7]]  # an explicit list of row labels

# --- Row and column together ---
Info.loc[88, "Name"]                    # "Name" of row 88
Info.loc[88:100, "Name"]                # "Name" of rows 88 through 100
Info.loc[[88, 100], "Name"]             # "Name" of rows 88 and 100
Info.loc[88:100, ["Name", "Survived"]]  # several columns

print(Info['Fare'].max())  # maximum of the column
print(Info['Fare'].max())  # printed a second time, as in the original notes

# Dropping missing values: axis=0 drops every row that contains NaN (the usual
# choice); axis=1 drops every column that contains NaN, which is rarely wanted
# because it removes a whole variable.
print(Info.dropna(axis=0))

Info.values   # the data as a numpy.ndarray
Info.columns  # the column labels
3. Sorting and index setting
# DataFrame.sort_values options:
#   ascending -- True sorts in ascending order, False in descending order
#   axis      -- 0 (default) orders the rows by the named column;
#                1 orders the columns by the named row label
#   inplace   -- whether to modify the original DataFrame instead of
#                returning a sorted copy
Info_sorted = Info.sort_values("Fare", ascending=True, axis=0)  # sort once, reuse below
Info_sorted.head(2)

# reset_index(drop=False) keeps the old index as an ordinary column;
# drop=True discards it. The default is drop=False.
Info_sorted.head(10).reset_index(drop=True)

# Use an existing column as the new index.
Info_sorted.head(10).set_index("Fare")
Selecting the columns whose names end in "d"
# Select the columns whose names end in 'd'.
cols = Info.columns.tolist()  # column labels as a plain list
P = [name for name in cols if name.endswith('d')]
Info[P]
Ticket price at 60% of the original fare (a 40% discount)
# Info['Fare'] = Info['Fare'] * 0.6
# Note: both sides of `=` must have the same dimension (number of rows).
Titanic passenger statistics (survivors' average age, average fare per class)
# --- Average age of the survivors ---
InfoS = Info.query("Survived == 1")
age_is_null = pd.isnull(InfoS['Age'])  # True where Age is missing
Info_Age = InfoS['Age'][~age_is_null]  # keep only the rows with a known age
Info_Age.sum() / Info_Age.shape[0]  # 28.343689655172415
# Same answer without filtering first, which shows that mean() skips
# missing values automatically.
InfoS['Age'].mean()  # 28.343689655172415

# --- Average fare per passenger class ---
Passenger_classes = [1, 2, 3]  # the three passenger classes
Info_P = Info['Pclass']
R = [Info['Fare'][Info_P == pclass].mean() for pclass in Passenger_classes]
R
Applying a custom function
# Custom function
def func(col):
    """Return the entry of ``col`` stored under the label 99."""
    return col.loc[99]


# Use the custom function: DataFrame.apply calls it once per column
# (axis=0 is the default), so h holds the values of row 99.
h = Info.apply(func)
h

# Library functions can be applied to a single column as well.
import math
Info["Fare"].apply(math.sqrt)
Series
# A Series is a single row or a single column of a DataFrame.
from pandas import Series

n = Info["Name"].shape[0]

# Construct a Series from (values, index).
# The labels must be unique and must include 'b': the lookup A[['a', 'b']]
# raises KeyError for a missing label, and reindex() refuses an index
# that contains duplicates.
A = Series(Info["Name"].head(3).values, ['c', 'a', 'b'])
A['a']
A[['a', 'b']]
A.sort_values()  # sort by value, a-z ascending
Index = sorted(A.index.tolist())  # sort the index labels
A.reindex(Index)  # reorder the data to follow the new index order (each label keeps its value)