# نحوه برخورد با داده‌های گمشده در R
# Detecting missing values in a vector ----
# Lines starting with `#>` are the console output of the preceding call.

# is.na() flags each missing element with TRUE.
y <- c(1, 2, 3, NA)
is.na(y)
#> [1] FALSE FALSE FALSE  TRUE

# na.rm = TRUE drops the missing values before the mean is computed.
x <- c(1, 2, NA, 3)
mean(x, na.rm = TRUE)
#> [1] 2

# na.omit() removes the NA elements and records where they were in the
# "na.action" attribute of the result.
x <- c(1, 2, NA, 3)
na.omit(x)
#> [1] 1 2 3
#> attr(,"na.action")
#> [1] 3
#> attr(,"class")
#> [1] "omit"

# Detecting missing values in a data frame ----
df <- data.frame(
  col1 = c(1:3, NA),
  col2 = c("this", NA, "is", "text"),
  col3 = c(TRUE, FALSE, TRUE, TRUE),
  col4 = c(2.5, 4.2, 3.2, NA),
  stringsAsFactors = FALSE
)

# On a data frame, is.na() returns one logical per cell.
is.na(df)
#>       col1  col2  col3  col4
#> [1,] FALSE FALSE FALSE FALSE
#> [2,] FALSE  TRUE FALSE FALSE
#> [3,] FALSE FALSE FALSE FALSE
#> [4,]  TRUE FALSE FALSE  TRUE

# A single column is an ordinary vector.
is.na(df$col4)
#> [1] FALSE FALSE FALSE  TRUE

# Position of the missing element in the vector x.
which(is.na(x))
#> [1] 3

# Total number of missing cells in df, then the count per column.
sum(is.na(df))
#> [1] 3
colSums(is.na(df))
#> col1 col2 col3 col4
#>    1    1    0    1

# Mean imputation: overwrite the missing element of x with the mean of the
# observed elements.
x[is.na(x)] <- mean(x, na.rm = TRUE)
# Recoding a sentinel value as missing ----
# Lines starting with `#>` are the console output of the preceding call.

# Here 99 marks a missing observation. `df == 99` is TRUE for every such cell,
# so a single assignment replaces all of them with NA.
df <- data.frame(col1 = c(1:3, 99), col2 = c(2.5, 4.2, 99, 3.2))
df[df == 99] <- NA
df
#>   col1 col2
#> 1    1  2.5
#> 2    2  4.2
#> 3    3   NA
#> 4   NA  3.2
# Mean imputation of a single data-frame column ----
# Lines starting with `#>` are the console output of the preceding call.

df <- data.frame(
  col1 = c(1:3, NA),
  col2 = c("this", NA, "is", "text"),
  col3 = c(TRUE, FALSE, TRUE, TRUE),
  col4 = c(2.5, 4.2, 3.2, NA),
  stringsAsFactors = FALSE
)

# Only col4 is imputed: its NA becomes the mean of the observed values
# (2.5, 4.2, 3.2 -> 3.3). The NAs in col1 and col2 are left untouched.
df$col4[is.na(df$col4)] <- mean(df$col4, na.rm = TRUE)
df
#>   col1 col2  col3 col4
#> 1    1 this  TRUE  2.5
#> 2    2 <NA> FALSE  4.2
#> 3    3   is  TRUE  3.2
#> 4   NA text  TRUE  3.3
# Dropping incomplete rows from a data frame ----
# Lines starting with `#>` are the console output of the preceding call.

df <- data.frame(
  col1 = c(1:3, NA),
  col2 = c("this", NA, "is", "text"),
  col3 = c(TRUE, FALSE, TRUE, TRUE),
  col4 = c(2.5, 4.2, 3.2, NA),
  stringsAsFactors = FALSE
)

# na.omit() keeps only the rows with no NA in any column: row 2 (NA in col2)
# and row 4 (NA in col1 and col4) are removed. The result is printed, not
# assigned, so df itself is unchanged.
na.omit(df)
#>   col1 col2 col3 col4
#> 1    1 this TRUE  2.5
#> 3    3   is TRUE  3.2
# Missing values in model fitting: na.omit vs. na.exclude ----
# Lines starting with `#>` are the console output of the preceding call.
# NOTE: no seed is set, so rnorm() draws different values on every run; the
# numbers shown below come from one particular session.

x1 <- c(rnorm(10, 0, 1), NA, rnorm(5, 10, 2), NA)

# R names are case-sensitive. Evaluating `X1` instead of `x1` stops with an
# error, so the call is kept here as a comment only:
#   X1
#   #> Error: object 'X1' not found
x1
#>  [1] -0.3329234  1.3631137 -0.4691473  0.8428756 -1.4579937 -0.4003059
#>  [7] -0.7764173 -0.3692965  1.2401015 -0.1074338         NA 10.3451870
#> [13] 10.5092025  8.7709323  7.1415698  9.3380491         NA

x2 <- c(NA, rnorm(5, 1, 10), rnorm(5, 10, 20), rnorm(5, 0, 0.2), NA)
x2
#>  [1]           NA   2.28386063  11.18119992  -1.55573692  -2.02541011
#>  [6]  17.15190683  -5.47426709  18.48004803  -1.67893963  18.30071358
#> [11] -20.90523313  -0.10374990  -0.05595831   0.20149148  -0.09391399
#> [16]   0.05957941           NA

# Both na.action choices fit the model on the same complete cases, so the
# printed coefficients are identical.
model.omit <- lm(x2 ~ x1, na.action = na.omit)
model.omit
#> Call:
#> lm(formula = x2 ~ x1, na.action = na.omit)
#>
#> Coefficients:
#> (Intercept)           x1
#>      6.3210      -0.6923

model.exclude <- lm(x2 ~ x1, na.action = na.exclude)
model.exclude
#> Call:
#> lm(formula = x2 ~ x1, na.action = na.exclude)
#>
#> Coefficients:
#> (Intercept)           x1
#>      6.3210      -0.6923

# The difference shows up in residuals and fitted values. With na.omit the
# incomplete cases (1, 11, 17) are simply absent from the result ...
resid(model.omit)
#>            2            3            4            5            6            7
#>  -3.09354117   4.53538932  -7.29328094  -9.35576370  10.55375267 -12.33278969
#>            8            9           10           12           13           14
#>  11.90336058  -7.14149841  11.90530431   0.73681643   0.89815010  -0.04774241
#>           15           16
#>  -1.47109729   0.20294020

# ... whereas with na.exclude they are padded back in as NA, so the result
# has one entry per original observation.
resid(model.exclude)
#>            1            2            3            4            5            6
#>           NA  -3.09354117   4.53538932  -7.29328094  -9.35576370  10.55375267
#>            7            8            9           10           11           12
#> -12.33278969  11.90336058  -7.14149841  11.90530431           NA   0.73681643
#>           13           14           15           16           17
#>   0.89815010  -0.04774241  -1.47109729   0.20294020           NA

fitted(model.omit)
#>          2          3          4          5          6          7          8
#>  5.3774018  6.6458106  5.7375440  7.3303536  6.5981542  6.8585226  6.5766874
#>          9         10         12         13         14         15         16
#>  5.4625588  6.3954093 -0.8405663 -0.9541084  0.2492339  1.3771833 -0.1433608

fitted(model.exclude)
#>          1          2          3          4          5          6          7
#>         NA  5.3774018  6.6458106  5.7375440  7.3303536  6.5981542  6.8585226
#>          8          9         10         11         12         13         14
#>  6.5766874  5.4625588  6.3954093         NA -0.8405663 -0.9541084  0.2492339
#>         15         16         17
#>  1.3771833 -0.1433608         NA
# How summary functions treat missing values ----
# Lines starting with `#>` are the console output of the preceding call.
# x1 is the 17-element numeric vector containing two NAs that was built
# earlier in this script; the numbers shown come from one particular session.

# Without na.rm the mean of a vector containing NA is itself NA.
mean(x1)
#> [1] NA
mean(x1, na.rm = TRUE)
#> [1] 3.042501

# summary() computes its statistics on the observed values and reports the
# number of NAs in a separate column.
summary(x1)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
#> -1.4580 -0.3848  0.8429  3.0425  7.9563 10.5092       2

# table() leaves NA out of the counts by default ...
table(x1)
#> x1
#>  -1.45799372254209 -0.776417285336318 -0.469147339576369  -0.40030592004892
#>                  1                  1                  1                  1
#>  -0.36929651130101 -0.332923350974514 -0.107433808339981  0.842875632136583
#>                  1                  1                  1                  1
#>   1.24010145861679    1.3631137069225   7.14156980799796   8.77093234201115
#>                  1                  1                  1                  1
#>   9.33804913076792   10.3451870124484   10.5092025364972
#>                  1                  1                  1

# ... and adds an <NA> category when useNA = "ifany" and NAs are present.
table(x1, useNA = "ifany")
#> x1
#>  -1.45799372254209 -0.776417285336318 -0.469147339576369  -0.40030592004892
#>                  1                  1                  1                  1
#>  -0.36929651130101 -0.332923350974514 -0.107433808339981  0.842875632136583
#>                  1                  1                  1                  1
#>   1.24010145861679    1.3631137069225   7.14156980799796   8.77093234201115
#>                  1                  1                  1                  1
#>   9.33804913076792   10.3451870124484   10.5092025364972               <NA>
#>                  1                  1                  1                  2

# sort() silently drops NA by default: 15 values come back although x1 has
# 17 elements.
sort(x1)
#>  [1] -1.4579937 -0.7764173 -0.4691473 -0.4003059 -0.3692965 -0.3329234
#>  [7] -0.1074338  0.8428756  1.2401015  1.3631137  7.1415698  8.7709323
#> [13]  9.3380491 10.3451870 10.5092025
length(x1)
#> [1] 17

# na.last = TRUE keeps the NAs and places them at the end.
sort(x1, na.last = TRUE)
#>  [1] -1.4579937 -0.7764173 -0.4691473 -0.4003059 -0.3692965 -0.3329234
#>  [7] -0.1074338  0.8428756  1.2401015  1.3631137  7.1415698  8.7709323
#> [13]  9.3380491 10.3451870 10.5092025         NA         NA