# Missing values ----

# Import the players' card-game data. The file is read without a header row,
# so read.csv() assigns default V1, V2, ... names until they are replaced below.
# Alternative call that spells out the NA marker explicitly:
# player <- read.csv("./data/chapter05/玩家玩牌数据.csv", header = FALSE, na.strings = "NA")
player <- read.csv("./data/chapter05/玩家玩牌数据.csv", header = FALSE)
head(player)
str(player)
# Column names, in order: user id, gender, level, in-site friend count,
# experience, score, total logins, games played, games won, currency on hand.
player_col_names <- c(
  "用户id", "性别", "等级", "站内好友数", "经验值",
  "积分", "登录总次数", "玩牌局数", "赢牌局数", "身上货币量"
)

# Assign the variable names and check them.
colnames(player) <- player_col_names
colnames(player)
# Show the first six rows.
head(player)

# Use is.na() to flag each value of "玩牌局数" (games played) that is missing.
is.na(player[["玩牌局数"]])

# Tabulate missing vs. non-missing values.
table(is.na(player[["玩牌局数"]]))

# sum() and mean() over the logical mask give the count and the share of NAs.
# Number of missing values.
sum(is.na(player[["玩牌局数"]]))

# Proportion of missing values.
mean(is.na(player[["玩牌局数"]]))

# complete.cases() marks rows without any NA; count the complete rows.
sum(complete.cases(player))

# Inspect the missing-value pattern of `player` with md.pattern() from mice.
# requireNamespace() only checks that the package is installed; library()
# then attaches it, so md.pattern() is found even right after a fresh install
# (require() followed by install.packages() alone would leave it unattached).
if (!requireNamespace("mice", quietly = TRUE)) {
  install.packages("mice")
}
library(mice)
md.pattern(player)

# -- Drop samples with missing values --
# Number of rows that contain at least one NA before cleaning.
sum(!complete.cases(player))
player_full <- na.omit(player)
# Number of rows with missing values after na.omit().
sum(!complete.cases(player_full))

# -- Replace missing values --

# Overall-mean imputation.
# Keep only Sepal.Length (column 1) and Species (column 5).
iris1 <- iris[, c(1, 5)]
head(iris1)
table(iris1$Species)
# Set Sepal.Length of samples 40, 80 and 120 to missing.
iris1[c(40, 80, 120), 1] <- NA
# Mean of the remaining observations, printed for inspection.
Sepal.Length.mean <- mean(iris1$Sepal.Length, na.rm = TRUE)
Sepal.Length.mean
iris1[c(40, 80, 120), 1] <- round(Sepal.Length.mean, 1)
iris1[c(40, 80, 120), 1]
# Compare the original values with the imputed ones.
iris[c(40, 80, 120), 1]
iris1[c(40, 80, 120), 1]

# Class-mean imputation: fill each gap with the mean of its own species.
# Set Sepal.Length of samples 40, 80 and 120 to missing.
iris2 <- iris[, c(1, 5)]
iris2[c(40, 80, 120), 1] <- NA
# Rows are selected through iris2's own Species column, so this step does not
# depend on iris1 (both hold the same Species values).
iris2[40, 1] <- round(
  mean(iris2[iris2$Species == "setosa", "Sepal.Length"], na.rm = TRUE),
  1
)
iris2[80, 1] <- round(
  mean(iris2[iris2$Species == "versicolor", "Sepal.Length"], na.rm = TRUE),
  1
)
iris2[120, 1] <- round(
  mean(iris2[iris2$Species == "virginica", "Sepal.Length"], na.rm = TRUE),
  1
)
# Compare: original values, overall-mean fill, class-mean fill.
iris[c(40, 80, 120), 1]
iris1[c(40, 80, 120), 1]
iris2[c(40, 80, 120), 1]

# Data transformation ----

# Import the data.
# NOTE(review): hard-coded absolute Windows path — adjust to the local copy.
rawdata <- read.csv("D://小学期/数据转换数据.csv")
# Show the first six rows and the column types.
head(rawdata)
str(rawdata)
# Convert the registration column to Date.
# The values are laid out as yyyymmdd (year = chars 1-4, month = 5-6,
# day = 7-8), so they can be parsed directly with a "%Y%m%d" format instead of
# being cut apart with substr() and re-joined with "-".
rawdata$registration <- as.Date(
  as.character(rawdata$registration),
  format = "%Y%m%d"
)
head(rawdata)
str(rawdata)

# Convert the first-payment date column to Date.
# Same yyyymmdd layout as the registration column (year = chars 1-4,
# month = 5-6, day = 7-8); missing entries stay NA after parsing.
rawdata$firstpaydate <- as.Date(
  as.character(rawdata$firstpaydate),
  format = "%Y%m%d"
)

# Show the first six rows and the column types.
head(rawdata)
str(rawdata)

# Add ispay: 0 = non-paying user, 1 = paying user (has a first-payment date).
rawdata$ispay <- ifelse(!is.na(rawdata$firstpaydate), 1, 0)
head(rawdata)
# Add isnewpay: 1 = paid for the first time on the registration day,
# 0 = otherwise.
rawdata$isnewpay <- ifelse(rawdata$registration == rawdata$firstpaydate, 1, 0)
head(rawdata)
# The comparison yields NA where a date is missing; treat those rows as 0.
rawdata[is.na(rawdata$isnewpay), "isnewpay"] <- 0
# Show the first six rows (head() default).
head(rawdata)

# Standardise with (x - mu) / sd; gives the same result as scale().
# install.packages("caret")
library(caret)
?preProcess
# preProcess() is called with its default method here.
standard <- preProcess(iris)
head(predict(standard, iris))
head(scale(iris[, 1:4]))
# Rescale with (x - min(x)) / (max(x) - min(x)).
standard <- preProcess(iris, method = "range")
head(predict(standard, iris))
# Hand-written min-max scaling for comparison with the caret output.
fun <- function(x) (x - min(x)) / (max(x) - min(x))
# vapply() pins the result to one numeric column per input column.
head(vapply(iris[, 1:4], fun, numeric(nrow(iris))))

# Binning ----

# Bin continuous variables with cut().

# Bin `days` (active days).
# Labels: within one month, 31-60 days, 61-90 days, more than three months.
# NOTE(review): cut() defaults to right-closed intervals without
# include.lowest, so a value of exactly 0 maps to NA — confirm this is intended.
head(rawdata)
rawdata$days_interval <- cut(
  rawdata$days,
  breaks = c(0, 30, 60, 90, Inf),
  labels = c("一个月内", "31_60天", "61_90天", "三个月以上")
)
head(rawdata)
# Bin `lifetime` (life cycle).
# Labels: under one week, under three weeks, under one month,
# under three months, more than three months.
rawdata$lifetime_interval <- cut(
  rawdata$lifetime,
  breaks = c(0, 7, 21, 30, 90, Inf),
  labels = c(
    "小于一周", "小于三周", "小于一个月",
    "小于三个月", "三个月以上"
  )
)
# Show the first six rows.
head(rawdata)

# Data standardisation ----

# Standardise with (x - mu) / sd; gives the same result as scale().
# install.packages("caret")
library(caret)
?preProcess
# preProcess() is called with its default method here.
standard <- preProcess(iris)
head(predict(standard, iris))
head(scale(iris[, 1:4]))
# Rescale with (x - min(x)) / (max(x) - min(x)).
standard <- preProcess(iris, method = "range")
head(predict(standard, iris))
# Hand-written min-max scaling for comparison with the caret output.
fun <- function(x) (x - min(x)) / (max(x) - min(x))
# vapply() pins the result to one numeric column per input column.
head(vapply(iris[, 1:4], fun, numeric(nrow(iris))))

# Encoding discrete data ----

# Build the customers data set.
# stringsAsFactors = TRUE keeps gender and mood as factors on R >= 4.0 as well
# (it was the default before 4.0), which the later dummy-variable steps expect.
customers <- data.frame(
  id = c(10, 20, 30, 40, 50),
  gender = c("male", "female", "female", "male", "female"),
  mood = c("happy", "sad", "happy", "sad", "happy"),
  outcome = c(1, 1, 0, 0, 0),
  stringsAsFactors = TRUE
)
customers
# Create the new data frame customers.new from the id and outcome columns.
customers.new <- customers[, c("id", "outcome")]
customers.new
# Dummy-code gender by hand: one 0/1 indicator column per level.
customers.new$gender.male <- ifelse(customers$gender == "male", 1, 0)
customers.new$gender.female <- ifelse(customers$gender == "female", 1, 0)
# Keep the original gender column next to its indicators.
customers.new$gender <- customers$gender
customers.new
# Dummy-code mood by hand in the same way.
customers.new$mood.happy <- ifelse(customers$mood == "happy", 1, 0)
customers.new$mood.sad <- ifelse(customers$mood == "sad", 1, 0)
customers.new

# Attach the caret package.

library(caret)

# Inspect the structure of customers.
# NOTE(review): the dummy columns below presumably rely on gender and mood
# being factors — confirm with this str() output when running on R >= 4.0.

str(customers)

# Dummy-code every column of customers with dummyVars().

dmy <- dummyVars(~ ., data = customers)

# Predict on the same data and convert the result to a data.frame.

trsf <- data.frame(predict(dmy, newdata = customers))

# Show the converted result.

trsf

# Convert the outcome variable to a factor.

customers$outcome <- as.factor(customers$outcome)

# Dummy-code customers again with dummyVars(), now that outcome is a factor.

dmy <- dummyVars(~ ., data = customers)

# Predict on the same data and convert the result to a data.frame.

trsf <- data.frame(predict(dmy, newdata = customers))

# Show the converted result.

trsf

# Dummy-code the gender variable only.

dmy.gender <- dummyVars(~ gender, data = customers)
trsf.gender <- data.frame(predict(dmy.gender, newdata = customers))
trsf.gender

# Set both levelsOnly and fullRank to TRUE.

# Rebuild customers from scratch. stringsAsFactors = TRUE keeps gender and
# mood as factors on R >= 4.0 as well (it was the default before 4.0).
customers <- data.frame(
  id = c(10, 20, 30, 40, 50),
  gender = c("male", "female", "female", "male", "female"),
  mood = c("happy", "sad", "happy", "sad", "happy"),
  outcome = c(1, 1, 0, 0, 0),
  stringsAsFactors = TRUE
)
dmy <- dummyVars(~ ., data = customers, levelsOnly = TRUE, fullRank = TRUE)
trsf <- data.frame(predict(dmy, newdata = customers))
trsf

# Same encoding with an extra string-valued column `test` that has four
# distinct values. stringsAsFactors = TRUE keeps the string columns as factors
# on R >= 4.0 as well (it was the default before 4.0).
customers <- data.frame(
  id = c(10, 20, 30, 40, 50),
  gender = c("male", "female", "female", "male", "female"),
  mood = c("happy", "sad", "happy", "sad", "happy"),
  outcome = c(1, 1, 0, 0, 0),
  test = c("1", "2", "1", "3", "4"),
  stringsAsFactors = TRUE
)
dmy <- dummyVars(~ ., data = customers, levelsOnly = TRUE, fullRank = TRUE)
trsf <- data.frame(predict(dmy, newdata = customers))
trsf