Before importing data
Clear demand
Understanding data
Data quality
Import data
Supporting multiple data sources
text file
read.table,read.csv,read.delim
scan
Excel file
csv, prn format + read.csv
Clipboard + read.delim
xlsx extension package
rodbc package
Data cleaning
Missing Value Processing
NA,NaN,Inf,-Inf
Identifying missing data
First, the unreasonable values are re-coded as missing values.
is.na,complete.case
mice::pattern
VIM::aggr,VIM::matrixplot
Dealing with missing values
Does the missing value have business implications?
Delete na.omit, row deletion, paired deletion
replace
Abnormal Value Processing
data conversion
Data Normalization: Scaling the data to a small specific range
Extreme difference
Standardized scale
Normalization by decimal scaling
Improving Distribution Conversion
Take logarithms, squares, reciprocals, exponents
Discretization of numerical variables
ifelse()
Extension of ifelse
Cuts and Hmisc::cut2
Data filtering
Random sampling
sample()
set.seed
Conditional filtration
subset,which
The Gospel of SQL Fans
Need to install the SQL package
install.packages('sqldf')
library(sqldf)
Variable selection
Selection of useful variables
Eliminate the correlation between variables
Note: Data filtering and variable filtering are also known as data protocols.
Derived data
Supporting multiple data sources
text file
write.table,write.csv
# View the current working directory > getwd() [1] "C:/Users/Administrator/Documents" # Modify the current working directory > setwd('G:/work/R') > getwd() [1] "G:/work/R" # Read data sample > x = read.table('G:/work/R/lastsave1.txt',header = F,sep = ',') > x = read.csv('G:/work/R/lastsave1.txt',header = F) > x = read.delim('G:/work/R/lastsave1.txt',header = F,sep = ',') > x = read.delim('clipboard',header = F) //The data read with scan is a list, converted into a data frame using as.data.frame > x = scan('G:/work/R/lastsave1.txt',what = list(date = "",pv = 0,uv = 0),sep = ",") > x = as.data.frame(x) //File writing example: > write.table(x,'G:/work/R/lastsave2.txt',sep = ',',quote = F,col.names = T) //Examples of missing value checking: > (vx = c(1,3,NA,6,8)) [1] 1 3 NA 6 8 > (y1 = is.na(vx)) [1] FALSE FALSE TRUE FALSE FALSE > (y2 = complete.cases(vx)) [1] TRUE TRUE FALSE TRUE TRUE is.na and complete.cases Differences: > (mx = matrix(1:12,3,4)) [,1] [,2] [,3] [,4] [1,] 1 4 7 10 [2,] 2 5 8 11 [3,] 3 6 9 12 > mx[1,2:4] = NA > mx [,1] [,2] [,3] [,4] [1,] 1 NA NA NA [2,] 2 5 8 11 [3,] 3 6 9 12 > (y3 = is.na(mx)) [,1] [,2] [,3] [,4] [1,] FALSE TRUE TRUE TRUE [2,] FALSE FALSE FALSE FALSE [3,] FALSE FALSE FALSE FALSE > (y4 = complete.cases(mx)) [1] FALSE TRUE TRUE //Missing value processing: > vx[y1] =10 > vx [1] 1 3 10 6 8 > mx[y3] = 100 > mx [,1] [,2] [,3] [,4] [1,] 1 100 100 100 [2,] 2 5 8 11 [3,] 3 6 9 12 //Install mice and VIM packages in R > install.packages(c('mice','VIM')) //Loading VIM package > library(VIM) > str(sleep) > summary(sleep) > sum(is.na(sleep$Dream)) > apply(is.na(sleep),2,sum) //Loading mice packages > library(mice) > md.pattern(sleep) > library(VIM) //Graphical representation missing values: > aggr(sleep,prop=F,number=TRUE) # Drawing missing value histogram and sum > matrixplot(sleep) # Red denotes missing values from shallow to deep denoting numerical changes. //Missing Value Processing # Delete data with missing values > newsleep = na.omit(sleep) > str(newsleep) > md.pattern(newsleep) > cor(sleep,use = "pairwise.complete.obs") # pairwise # The missing values can also be filled with median median. > x = c(1,2,NA,100,NA,2,4) > mean(x,na.rm = T) [1] 21.8 > x[is.na(x)] = mean(x,na.rm = T) > x [1] 1.0 2.0 21.8 100.0 21.8 2.0 4.0 # Fill in by column or row correlation //data conversion > str(airquality) > airquality # sort > airquality = airquality[order(airquality$Temp),] > head(airquality,5) Ozone Solar.R Wind Temp Month Day 5 NA NA 14.3 56 5 5 18 6 78 18.4 57 5 18 25 NA 66 16.6 57 5 25 27 NA NA 8.0 57 5 27 15 18 65 13.2 58 5 15 > quantile(airquality$Temp,probs = c(0,0.3,0.6,1.0)) 0% 30% 60% 100% 56 74 81 97 > airquality$isHot = ifelse(airquality$Temp>80,T,F) > head(airquality,5) Ozone Solar.R Wind Temp Month Day isHot 5 NA NA 14.3 56 5 5 FALSE 18 6 78 18.4 57 5 18 FALSE 25 NA 66 16.6 57 5 25 FALSE 27 NA NA 8.0 57 5 27 FALSE 15 18 65 13.2 58 5 15 FALSE > tail(airquality,3) Ozone Solar.R Wind Temp Month Day isHot 123 85 188 6.3 94 8 31 TRUE 122 84 237 6.3 96 8 30 TRUE 120 76 203 9.7 97 8 28 TRUE > airquality = within(airquality,{TempL = NA + TempL[Temp>80] = 'Hot' + TempL[Temp > 70 & Temp <= 80] = 'Warm' + TempL[Temp <= 70] = 'Cold' + }) > head(airquality,5) Ozone Solar.R Wind Temp Month Day isHot TempL 5 NA NA 14.3 56 5 5 FALSE Cold 18 6 78 18.4 57 5 18 FALSE Cold 25 NA 66 16.6 57 5 25 FALSE Cold 27 NA NA 8.0 57 5 27 FALSE Cold 15 18 65 13.2 58 5 15 FALSE Cold > airquality$TempL # Conversion factor type > airquality$TempL = factor(airquality$TempL,levels = c('Cold','Warm','Hot'),ordered = TRUE) > airquality$TempL > unclass(airquality$TempL) > airquality = within(airquality,{ + TempL1 = cut(Temp,breaks = c(56,73,81,97),include.lowest = T) + }) > head(airquality,5) > airquality$TempL1 > airquality = within(airquality,{ + TempL2 = cut(Temp,breaks = quantile(Temp,probs = c(0.0,0.3,0.7,1.0)),include.lowest = T) + }) > table(airquality$TempL2) # How many samples per interval > prop.table(table(airquality$TempL2)) # Proportion of sample per interval # There are cut2 functions in the Hmisc package > install.packages('Hmisc') > library(Hmisc) > airquality = within(airquality,{ + TempL3 = cut2(Temp,g = 4) + }) > head(airquality,5) Ozone Solar.R Wind Temp Month Day isHot TempL TempL1 TempL2 TempL3 5 NA NA 14.3 56 5 5 FALSE Cold [56,73] [56,74] [56,73) 18 6 78 18.4 57 5 18 FALSE Cold [56,73] [56,74] [56,73) 25 NA 66 16.6 57 5 25 FALSE Cold [56,73] [56,74] [56,73) 27 NA NA 8.0 57 5 27 FALSE Cold [56,73] [56,74] [56,73) 15 18 65 13.2 58 5 15 FALSE Cold [56,73] [56,74] [56,73) > as.numeric(airquality$TempL3) [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [40] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [79] 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 [118] 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 > unclass(airquality$TempL3) [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [40] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [79] 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 [118] 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 attr(,"levels") [1] "[56,73)" "[73,80)" "[80,86)" "[86,97]" //data reduction > subset(airquality,Month %in% c(7,9)) > subset(airquality,Month %in% c(7,9) & Day %in% 10:15) > subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80) > subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80 , select = c(Month,Day,Temp)) > subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80 , select = Temp:Day) > x = sample(1:nrow(airquality),size = 10,replace = T) > x [1] 80 119 4 51 55 114 73 23 117 12 > airSam = airquality[x,] > airSam Ozone Solar.R Wind Temp Month Day isHot TempL TempL1 TempL2 TempL3 116 45 212 9.7 79 8 24 FALSE Warm (73,81] (74,83] [73,80) 97 35 NA 7.4 85 8 5 TRUE Hot (81,97] (83,97] [80,86) 27 NA NA 8.0 57 5 27 FALSE Cold [56,73] [56,74] [56,73) 132 21 230 10.9 75 9 9 FALSE Warm (73,81] (74,83] [73,80) 53 NA 59 1.7 76 6 22 FALSE Warm (73,81] (74,83] [73,80) 129 32 92 15.5 84 9 6 TRUE Hot (81,97] (83,97] [80,86) 131 23 220 10.3 78 9 8 FALSE Warm (73,81] (74,83] [73,80) 28 23 13 12.0 67 5 28 FALSE Cold [56,73] [56,74] [56,73) 81 63 220 11.5 85 7 20 TRUE Hot (81,97] (83,97] [80,86) 4 18 313 11.5 62 5 4 FALSE Cold [56,73] [56,74] [56,73) //Assuming 10 random seeds, we can ensure the consistency of multiple results and verify the previous results. > set.seed(10) > x1 = runif(10,10,100) > x2 = runif(10,10,100) > x1 [1] 55.67304 37.60917 48.42169 72.37919 17.66224 30.28930 34.70775 34.50746 65.42464 [10] 48.67044 > x2 [1] 68.64901 61.09640 20.21581 63.63328 42.22450 48.59285 14.67130 33.77599 45.89117 [10] 85.25207 SQL Statement data filtering > install.packages('sqldf') > library(sqldf) > newdf = sqldf('select * from airquality where Ozone>30') > newdf > newdf = sqldf('select Month,avg(Temp) as avTemp,count(*) as rCnt from airquality where Month in (7,9) group by Month') > newdf Month avTemp rCnt 1 7 83.90323 31 2 9 76.90000 30