R Language Learning Notes _Data Cleaning 1

Keywords: vim SQL Excel

Before importing data
Clear demand
Understanding data
Data quality
Import data
Supporting multiple data sources
text file
read.table,read.csv,read.delim
scan
Excel file
csv, prn format + read.csv
Clipboard + read.delim
xlsx extension package
rodbc package
Data cleaning
Missing Value Processing
NA,NaN,Inf,-Inf
Identifying missing data
First, the unreasonable values are re-coded as missing values.
is.na,complete.case
mice::pattern
VIM::aggr,VIM::matrixplot
Dealing with missing values
Does the missing value have business implications?
Delete na.omit, row deletion, paired deletion
replace
Abnormal Value Processing
data conversion
Data Normalization: Scaling the data to a small specific range
Extreme difference
Standardized scale
Normalization by decimal scaling
Improving Distribution Conversion
Take logarithms, squares, reciprocals, exponents
Discretization of numerical variables
ifelse()
Extension of ifelse
Cuts and Hmisc::cut2
Data filtering
Random sampling
sample()
set.seed
Conditional filtration
subset,which
The Gospel of SQL Fans
Need to install the SQL package
install.packages('sqldf')
library(sqldf)
Variable selection
Selection of useful variables
Eliminate the correlation between variables
Note: Data filtering and variable filtering are also known as data protocols.
Derived data
Supporting multiple data sources
text file
write.table,write.csv

# View the current working directory
> getwd()
[1] "C:/Users/Administrator/Documents"
# Modify the current working directory
> setwd('G:/work/R')
> getwd()
[1] "G:/work/R"
# Read data sample
> x = read.table('G:/work/R/lastsave1.txt',header = F,sep = ',')
> x = read.csv('G:/work/R/lastsave1.txt',header = F)
> x = read.delim('G:/work/R/lastsave1.txt',header = F,sep = ',')
> x = read.delim('clipboard',header = F)

//The data read with scan is a list, converted into a data frame using as.data.frame
> x = scan('G:/work/R/lastsave1.txt',what = list(date = "",pv = 0,uv = 0),sep = ",")
> x = as.data.frame(x)

//File writing example:
> write.table(x,'G:/work/R/lastsave2.txt',sep = ',',quote = F,col.names = T)

//Examples of missing value checking:
> (vx = c(1,3,NA,6,8))
[1]  1  3 NA  6  8
> (y1 = is.na(vx))
[1] FALSE FALSE  TRUE FALSE FALSE
> (y2 = complete.cases(vx))
[1]  TRUE  TRUE FALSE  TRUE  TRUE

is.na and complete.cases Differences:
> (mx = matrix(1:12,3,4))
     [,1] [,2] [,3] [,4]
[1,]    1    4    7   10
[2,]    2    5    8   11
[3,]    3    6    9   12
> mx[1,2:4] = NA
> mx
     [,1] [,2] [,3] [,4]
[1,]    1   NA   NA   NA
[2,]    2    5    8   11
[3,]    3    6    9   12
> (y3 = is.na(mx))
      [,1]  [,2]  [,3]  [,4]
[1,] FALSE  TRUE  TRUE  TRUE
[2,] FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE
> (y4 = complete.cases(mx))
[1] FALSE  TRUE  TRUE

//Missing value processing:
> vx[y1] =10
> vx
[1]  1  3 10  6  8
> mx[y3] = 100
> mx
     [,1] [,2] [,3] [,4]
[1,]    1  100  100  100
[2,]    2    5    8   11
[3,]    3    6    9   12

//Install mice and VIM packages in R
> install.packages(c('mice','VIM'))

//Loading VIM package
> library(VIM)

> str(sleep)
> summary(sleep)
> sum(is.na(sleep$Dream))
> apply(is.na(sleep),2,sum)

//Loading mice packages
> library(mice)

> md.pattern(sleep)

> library(VIM)
//Graphical representation missing values:
> aggr(sleep,prop=F,number=TRUE)        # Drawing missing value histogram and sum
> matrixplot(sleep)     # Red denotes missing values from shallow to deep denoting numerical changes.

//Missing Value Processing
# Delete data with missing values
> newsleep = na.omit(sleep)
> str(newsleep)
> md.pattern(newsleep)

> cor(sleep,use = "pairwise.complete.obs")  # pairwise

# The missing values can also be filled with median median.
> x = c(1,2,NA,100,NA,2,4)
> mean(x,na.rm = T)
[1] 21.8
> x[is.na(x)] = mean(x,na.rm = T)
> x
[1]   1.0   2.0  21.8 100.0  21.8   2.0   4.0

# Fill in by column or row correlation


//data conversion
> str(airquality)
> airquality
# sort
> airquality = airquality[order(airquality$Temp),]
> head(airquality,5)

   Ozone Solar.R Wind Temp Month Day
5     NA      NA 14.3   56     5   5
18     6      78 18.4   57     5  18
25    NA      66 16.6   57     5  25
27    NA      NA  8.0   57     5  27
15    18      65 13.2   58     5  15
> quantile(airquality$Temp,probs = c(0,0.3,0.6,1.0))
  0%  30%  60% 100%
  56   74   81   97
> airquality$isHot = ifelse(airquality$Temp>80,T,F)
> head(airquality,5)
   Ozone Solar.R Wind Temp Month Day isHot
5     NA      NA 14.3   56     5   5 FALSE
18     6      78 18.4   57     5  18 FALSE
25    NA      66 16.6   57     5  25 FALSE
27    NA      NA  8.0   57     5  27 FALSE
15    18      65 13.2   58     5  15 FALSE
> tail(airquality,3)
    Ozone Solar.R Wind Temp Month Day isHot
123    85     188  6.3   94     8  31  TRUE
122    84     237  6.3   96     8  30  TRUE
120    76     203  9.7   97     8  28  TRUE

> airquality = within(airquality,{TempL = NA
+     TempL[Temp>80] = 'Hot'
+     TempL[Temp > 70 & Temp <= 80] = 'Warm'
+     TempL[Temp <= 70] = 'Cold'
+ })

> head(airquality,5)
   Ozone Solar.R Wind Temp Month Day isHot TempL
5     NA      NA 14.3   56     5   5 FALSE  Cold
18     6      78 18.4   57     5  18 FALSE  Cold
25    NA      66 16.6   57     5  25 FALSE  Cold
27    NA      NA  8.0   57     5  27 FALSE  Cold
15    18      65 13.2   58     5  15 FALSE  Cold
> airquality$TempL

# Conversion factor type
> airquality$TempL = factor(airquality$TempL,levels = c('Cold','Warm','Hot'),ordered = TRUE)
> airquality$TempL
> unclass(airquality$TempL)

> airquality = within(airquality,{
+     TempL1 = cut(Temp,breaks = c(56,73,81,97),include.lowest = T)
+ })

> head(airquality,5)
> airquality$TempL1

> airquality = within(airquality,{
+     TempL2 = cut(Temp,breaks = quantile(Temp,probs = c(0.0,0.3,0.7,1.0)),include.lowest = T)
+ })

> table(airquality$TempL2)      # How many samples per interval
> prop.table(table(airquality$TempL2))      # Proportion of sample per interval

# There are cut2 functions in the Hmisc package
> install.packages('Hmisc')
> library(Hmisc)

> airquality = within(airquality,{
+     TempL3 = cut2(Temp,g = 4)
+ })

> head(airquality,5)
   Ozone Solar.R Wind Temp Month Day isHot TempL  TempL1  TempL2  TempL3
5     NA      NA 14.3   56     5   5 FALSE  Cold [56,73] [56,74] [56,73)
18     6      78 18.4   57     5  18 FALSE  Cold [56,73] [56,74] [56,73)
25    NA      66 16.6   57     5  25 FALSE  Cold [56,73] [56,74] [56,73)
27    NA      NA  8.0   57     5  27 FALSE  Cold [56,73] [56,74] [56,73)
15    18      65 13.2   58     5  15 FALSE  Cold [56,73] [56,74] [56,73)

> as.numeric(airquality$TempL3)
  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [40] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 [79] 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[118] 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4

> unclass(airquality$TempL3)
  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [40] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 [79] 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[118] 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
attr(,"levels")
[1] "[56,73)" "[73,80)" "[80,86)" "[86,97]"

//data reduction
> subset(airquality,Month %in% c(7,9))

> subset(airquality,Month %in% c(7,9) & Day %in% 10:15)

> subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80)

> subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80 , select = c(Month,Day,Temp))

> subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80 , select = Temp:Day)

> x = sample(1:nrow(airquality),size = 10,replace = T)
> x
 [1]  80 119   4  51  55 114  73  23 117  12
> airSam = airquality[x,]
> airSam
    Ozone Solar.R Wind Temp Month Day isHot TempL  TempL1  TempL2  TempL3
116    45     212  9.7   79     8  24 FALSE  Warm (73,81] (74,83] [73,80)
97     35      NA  7.4   85     8   5  TRUE   Hot (81,97] (83,97] [80,86)
27     NA      NA  8.0   57     5  27 FALSE  Cold [56,73] [56,74] [56,73)
132    21     230 10.9   75     9   9 FALSE  Warm (73,81] (74,83] [73,80)
53     NA      59  1.7   76     6  22 FALSE  Warm (73,81] (74,83] [73,80)
129    32      92 15.5   84     9   6  TRUE   Hot (81,97] (83,97] [80,86)
131    23     220 10.3   78     9   8 FALSE  Warm (73,81] (74,83] [73,80)
28     23      13 12.0   67     5  28 FALSE  Cold [56,73] [56,74] [56,73)
81     63     220 11.5   85     7  20  TRUE   Hot (81,97] (83,97] [80,86)
4      18     313 11.5   62     5   4 FALSE  Cold [56,73] [56,74] [56,73)

//Assuming 10 random seeds, we can ensure the consistency of multiple results and verify the previous results.
> set.seed(10)
> x1 = runif(10,10,100)
> x2 = runif(10,10,100)
> x1
 [1] 55.67304 37.60917 48.42169 72.37919 17.66224 30.28930 34.70775 34.50746 65.42464
[10] 48.67044
> x2
 [1] 68.64901 61.09640 20.21581 63.63328 42.22450 48.59285 14.67130 33.77599 45.89117
[10] 85.25207

SQL Statement data filtering
> install.packages('sqldf')
> library(sqldf)

> newdf = sqldf('select * from airquality where Ozone>30')
> newdf

> newdf = sqldf('select Month,avg(Temp) as avTemp,count(*) as rCnt from airquality where Month in (7,9) group by Month')
> newdf
  Month   avTemp rCnt
1     7 83.90323   31
2     9 76.90000   30

Posted by boiy on Thu, 04 Apr 2019 09:21:31 -0700