library(mlbench)
data(BostonHousing)
df <- BostonHousing
set.seed(123)
ind.train <- sample(1:nrow(df), size=400, replace=FALSE)
ind.test <- setdiff(1:nrow(df), ind.train)
df$medv <- ifelse(df$medv < 25, 0, 1)
df$medv <- factor(df$medv, levels=c("0", "1"))
y <- df$medv
x_discrete <- df$rad
x_continuous <- infotheo::discretize(df$nox)
infotheo::mutinformation(y, x_discrete)
## [1] 0.09452454
infotheo::mutinformation(y, x_continuous)
## [1] 0.09534981
praznik::CMIM(df[,-14], df$medv, k=10)
## $selection
## rm nox lstat ptratio age indus tax zn rad dis
## 6 5 13 11 7 3 10 2 9 8
##
## $score
## rm nox lstat ptratio age indus tax
## 0.26395414 0.09855933 0.09807178 0.06759851 0.04250372 0.04098204 0.04051790
## zn rad dis
## 0.03725492 0.03248110 0.02487993
więcej można przeczytać tu: https://mlr.mlr-org.com/articles/tutorial/filter_methods.html
task <- mlr::makeClassifTask(data=df, target="medv")
# potrzebne pakiety: FSelectorRcpp, FSelector
fv <- mlr::generateFilterValuesData(task,
method = c("FSelectorRcpp_information.gain", "FSelector_chi.squared"))
mlr::plotFilterValues(fv, filter="FSelectorRcpp_information.gain")
boruta.res <- Boruta::Boruta(medv~., data = df, maxRuns = 20)
print(boruta.res)
## Boruta performed 19 iterations in 3.938433 secs.
## 12 attributes confirmed important: age, b, crim, dis, indus and 7
## more;
## No attributes deemed unimportant.
## 1 tentative attributes left: chas;
Boruta::getSelectedAttributes(boruta.res, withTentative = F)
## [1] "crim" "zn" "indus" "nox" "rm" "age" "dis"
## [8] "rad" "tax" "ptratio" "b" "lstat"
plot(boruta.res)
# mcfs.res <- rmcfs::mcfs(medv~., data=df)
# png("fs.png")
# plot(mcfs.res)
# dev.off()
#
# powiazania <- rmcfs::build.idgraph(mcfs.res)
# png("fs2.png")
# plot(powiazania, label_dist = 1)
# dev.off()
df$chas <- as.numeric(as.character(df$chas))
glm.lasso <- glmnet::glmnet(as.matrix(df[,-14]), df$medv, family="binomial")
plot(glm.lasso)
glm.lasso.lambda <- glmnet::glmnet(as.matrix(df[,-14]), df$medv, family="binomial", lambda=0.01)
glm.lasso.lambda$beta
## 13 x 1 sparse Matrix of class "dgCMatrix"
## s0
## crim .
## zn .
## indus -0.02963303
## chas 0.42042521
## nox .
## rm 1.86524858
## age .
## dis -0.17010009
## rad 0.01540324
## tax .
## ptratio -0.17842433
## b .
## lstat -0.27107394
df.selected <- df[,c(glm.lasso.lambda$beta[,1]!=0, TRUE)]
glm.mod.sel <- glm(medv~., data=df.selected, family=binomial)
summary(glm.mod.sel)
##
## Call:
## glm(formula = medv ~ ., family = binomial, data = df.selected)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6331 -0.3340 -0.1165 0.0779 3.3407
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.95818 3.55547 -0.551 0.581804
## indus -0.13479 0.04903 -2.749 0.005971 **
## chas 0.76793 0.70072 1.096 0.273116
## rm 1.85489 0.41525 4.467 7.93e-06 ***
## dis -0.37774 0.10337 -3.654 0.000258 ***
## rad 0.11725 0.03766 3.114 0.001848 **
## ptratio -0.29636 0.09849 -3.009 0.002620 **
## lstat -0.40939 0.06497 -6.301 2.95e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 580.85 on 505 degrees of freedom
## Residual deviance: 238.13 on 498 degrees of freedom
## AIC: 254.13
##
## Number of Fisher Scoring iterations: 7
glm.mod <- glm(medv~., data=df, family=binomial)
glm.mod.AIC <- step(glm.mod, trace = 0)
lda.step <- klaR::stepclass(medv~., data=df, method="lda", direction="backward",
criterion="AC", improvement = 0.05, fold=10)
## accuracy: 0.70779; starting variables (13): crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat
## accuracy: 0.709; out: "crim"; variables (12): zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat
##
## hr.elapsed min.elapsed sec.elapsed
## 0.00 0.00 1.23
plot(lda.step)
sda.mod <- sparseLDA::sda(as.matrix(df[,-14]), df$medv)
sda.mod
##
## Call:
## sda.default(x = as.matrix(df[, -14]), y = df$medv)
##
## lambda = 1e-06
## stop = 13 variables
## classes = 0, 1
##
## Top 5 predictors (out of 13):
## b, age, tax, crim, zn