امیر مسعود ملک فر
امیر مسعود ملک فر
خواندن ۱ دقیقه·۲ سال پیش

داده‌های گمشده در R

نحوه برخورد با داده‌های گمشده در R

y <- c(1,2,3,NA)

is.na(y)

FALSE FALSE FALSE TRUE

x <- c(1, 2, NA, 3)

mean(x, na.rm=TRUE)

2

x <- c(1,2,NA,3)

na.omit(x)

1 2 3

attr(,"na.action")

3

attr(,"class")

"omit"

df <- data.frame(col1 = c(1:3, NA),

col2 = c("this", NA,"is", "text"),

col3 = c(TRUE, FALSE, TRUE, TRUE),

col4 = c(2.5, 4.2, 3.2, NA),

stringsAsFactors = FALSE)

is.na(df)

col1 col2 col3 col4
[1,] FALSE FALSE FALSE FALSE
[2,] FALSE TRUE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE
[4,] TRUE FALSE FALSE TRUE

is.na(df$col4)

FALSE FALSE FALSE TRUE

which(is.na(x))

3

sum(is.na(df))

3

colSums(is.na(df))

col1 col2 col3 col4
1 1 0 1

x[is.na(x)] <- mean(x, na.rm = TRUE)

df <- data.frame(col1 = c(1:3, 99), col2 = c(2.5, 4.2, 99, 3.2))

df[df == 99] <- NA

df

col1 col2
1 1 2.5
2 2 4.2
3 3 NA
4 NA 3.2

df <- data.frame(col1 = c(1:3, NA),

col2 = c("this", NA,"is", "text"),

col3 = c(TRUE, FALSE, TRUE, TRUE),

col4 = c(2.5, 4.2, 3.2, NA),

stringsAsFactors = FALSE)

df$col4[is.na(df$col4)] <- mean(df$col4, na.rm = TRUE)

df

col1 col2 col3 col4
1 1 this TRUE 2.5
2 2 <NA> FALSE 4.2
3 3 is TRUE 3.2
4 NA text TRUE 3.3

df <- data.frame(col1 = c(1:3, NA),

col2 = c("this", NA,"is", "text"),

col3 = c(TRUE, FALSE, TRUE, TRUE),

col4 = c(2.5, 4.2, 3.2, NA),

stringsAsFactors = FALSE)

na.omit(df)

col1 col2 col3 col4
1 1 this TRUE 2.5
3 3 is TRUE 3.2

x1=c(rnorm(10,0,1),NA,rnorm(5,10,2),NA)

X1
Error: object 'X1' not found

x1

[1] -0.3329234 1.3631137 -0.4691473 0.8428756 -1.4579937 -0.4003059
[7] -0.7764173 -0.3692965 1.2401015 -0.1074338 NA 10.3451870
[13] 10.5092025 8.7709323 7.1415698 9.3380491 NA

x2=c(NA,rnorm(5,1,10),rnorm(5,10,20),rnorm(5,0,0.2),NA)

x2
[1] NA 2.28386063 11.18119992 -1.55573692 -2.02541011
[6] 17.15190683 -5.47426709 18.48004803 -1.67893963 18.30071358
[11] -20.90523313 -0.10374990 -0.05595831 0.20149148 -0.09391399
[16] 0.05957941 NA

model.omit <- lm(x2 ~ x1, na.action = na.omit)

model.omit

Call:
lm(formula = x2 ~ x1, na.action = na.omit)
Coefficients:
(Intercept) x1
6.3210 -0.6923

model.exclude <- lm(x2 ~ x1, na.action = na.exclude)

model.exclude
Call:
lm(formula = x2 ~ x1, na.action = na.exclude)
Coefficients:
(Intercept) x1
6.3210 -0.6923

resid (model.omit)

2 3 4 5 6 7
-3.09354117 4.53538932 -7.29328094 -9.35576370 10.55375267 -12.33278969
8 9 10 12 13 14
11.90336058 -7.14149841 11.90530431 0.73681643 0.89815010 -0.04774241
15 16
-1.47109729 0.20294020

resid(model.exclude)

1 2 3 4 5 6
NA -3.09354117 4.53538932 -7.29328094 -9.35576370 10.55375267
7 8 9 10 11 12
-12.33278969 11.90336058 -7.14149841 11.90530431 NA 0.73681643
13 14 15 16 17
0.89815010 -0.04774241 -1.47109729 0.20294020 NA

fitted(model.omit)

2 3 4 5 6 7 8
5.3774018 6.6458106 5.7375440 7.3303536 6.5981542 6.8585226 6.5766874
9 10 12 13 14 15 16
5.4625588 6.3954093 -0.8405663 -0.9541084 0.2492339 1.3771833 -0.1433608

fitted(model.exclude)

1 2 3 4 5 6 7
NA 5.3774018 6.6458106 5.7375440 7.3303536 6.5981542 6.8585226
8 9 10 11 12 13 14
6.5766874 5.4625588 6.3954093 NA -0.8405663 -0.9541084 0.2492339
15 16 17
1.3771833 -0.1433608 NA

mean(x1)

NA

mean(x1, na.rm = TRUE)

3.042501

summary(x1)

Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-1.4580 -0.3848 0.8429 3.0425 7.9563 10.5092 2

table(x1)

x1
-1.45799372254209 -0.776417285336318 -0.469147339576369 -0.40030592004892
1 1 1 1
-0.36929651130101 -0.332923350974514 -0.107433808339981 0.842875632136583
1 1 1 1
1.24010145861679 1.3631137069225 7.14156980799796 8.77093234201115
1 1 1 1
9.33804913076792 10.3451870124484 10.5092025364972
1 1 1

table(x1, useNA = "ifany")

x1
-1.45799372254209 -0.776417285336318 -0.469147339576369 -0.40030592004892
1 1 1 1
-0.36929651130101 -0.332923350974514 -0.107433808339981 0.842875632136583
1 1 1 1
1.24010145861679 1.3631137069225 7.14156980799796 8.77093234201115
1 1 1 1
9.33804913076792 10.3451870124484 10.5092025364972 <NA>
1 1 1 2

sort(x1)

[1] -1.4579937 -0.7764173 -0.4691473 -0.4003059 -0.3692965 -0.3329234
[7] -0.1074338 0.8428756 1.2401015 1.3631137 7.1415698 8.7709323
[13] 9.3380491 10.3451870 10.5092025

length(x1)

17

sort(x1, na.last = TRUE)

[1] -1.4579937 -0.7764173 -0.4691473 -0.4003059 -0.3692965 -0.3329234
[7] -0.1074338 0.8428756 1.2401015 1.3631137 7.1415698 8.7709323
[13] 9.3380491 10.3451870 10.5092025 NA NA












داده گمشده، جایگزینی، نرم افزار r، هوش مصنوعی، col col
شاید از این پست‌ها خوشتان بیاید