# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Section 5: SVM Support Vector Machine ####
# --------------------------------------------------------
rm(list=ls())? # clear out running environment
library(randomForest)
library(class)
library(pROC)
library(e1071)
#SVM:一種復(fù)雜的分類工具
#### Example using SVM on the "best-student" data ####
# simulate data:生成隨機(jī)數(shù)
set.seed(1)
n = 100
mark = rnorm(n, m=50, sd=10)
choc = rnorm(n, m=60, sd=5)
summary(mark)
summary(choc)
#評(píng)分標(biāo)準(zhǔn)--f(x) separating hyperplane: f(x)=int+a*mark + b*choc
int = 10
a = 2
b = 4
# rating of students on basis of their marks and level of appreciation of chocolate:
# 根據(jù)學(xué)生的分?jǐn)?shù)和對(duì)巧克力的欣賞程度給他們打分
#生成data set:
mod = int + a*mark + b*choc? # values for true model,生成separating hyperplane,該線/平面上是理論值
z = rnorm(n,s=8)? # additive noise
obs = mod + z #圍繞separating hyperplane生成一系列實(shí)際觀測(cè)值
#分類:
y = as.factor(ifelse(obs>350,1,0))? # classification data,y記錄了每個(gè)實(shí)際觀測(cè)值所對(duì)應(yīng)的真實(shí)分類
plot(mod, obs, xlab="model", ylab="observations", pch=20, cex=2)
plot(mark, choc, xlab="x1 (mark)", ylab="x2 (choc)",
? ? pch=20, col=c(2,4)[as.numeric(y)], cex=2)
legend("topright", box.col=8, bty='n',
? ? ? legend=c("good student","better student"),
? ? ? pch=15, col=c(2,4))
table(y)#可以看到how complicated the data set is to be classify
set.seed(1)
# split the data into train+test (50%-50%):
# 生成的dataset是由觀測(cè)值mark&choc組成的x,對(duì)應(yīng)分類組成的y構(gòu)成的
x = data.frame(mark,choc)
i.train = sample(1:n, 50)
x.train = x[i.train,]
x.test = x[-i.train,]
y.train = y[i.train]
y.test = y[-i.train]
class(x.train)
class(y.train)
xm = as.matrix(x.train)
# fit an SVM as follows:
# ?svm # in e1071
set.seed(1)
svmo = svm(xm, y.train, kernel='polynomial') #kernel參數(shù)可換,根據(jù)plot圖像看一下大概是什么分割線
#把x和y輸入,通過肉眼觀察,假定kernel是多項(xiàng)式(曲線)的
#直線linear、曲線polynomial、圓radial basis
svmo
names(svmo)
# cf. ?svm:
# "Parameters of SVM-models usually must be tuned to yield sensible results!"
# 支持向量機(jī)模型的參數(shù)通常必須調(diào)整以產(chǎn)生合理的結(jié)果
#為了找到最合理的svm參數(shù)(find the maximum-margin hyperplane,離margin近的點(diǎn)才對(duì)其有影響)
# one can tune the model as follows:在指定范圍ranges&gamma內(nèi)找到最合適的Parameters
set.seed(1)
svm.tune = e1071::tune(svm, train.x=x.train, train.y=y.train,
? ? ? ? ? ? ? ? ? ? ? kernel='polynomial',
? ? ? ? ? ? ? ? ? ? ? ranges=list(cost=10^(-2:4), gamma=c(0.25,0.5,1,1.5,2)))
? ? ? ? ? ? ? ? ? ? # 這里列出一系列想要嘗試的調(diào)節(jié)參數(shù)
svm.tune
svm.tune$best.parameters#最好的
# then fit final SVM for optimal parameters:測(cè)試樣本
svmo.final = svm(xm, y.train, kernel='polynomial',
? ? ? ? ? ? ? ? gamma=svm.tune$best.parameters$gamma,
? ? ? ? ? ? ? ? cost=svm.tune$best.parameters$cost)
# corresponding confusion matrices:混淆矩陣看分的怎么樣
table(svmo$fitted,y.train)
table(svmo.final$fitted,y.train)
# we can also use caret for easy comparison:直接看accuracy
library(caret)
caret::confusionMatrix(svmo$fitted,y.train)$overall[1]
caret::confusionMatrix(svmo.final$fitted,y.train)$overall[1]
# assessing model fit to training data評(píng)估模型是否適合訓(xùn)練數(shù)據(jù)
identical(fitted(svmo), svmo$fitted)#TRUE——got the right information here
# to identify support vectors: 能左右hyperplane的點(diǎn)
# either svmo$index (indices), or svmo$SV (coordinates)
length(svmo$index)#tuned前的
length(svmo.final$index)#tuned后的,tuned后能左右hyperplane的點(diǎn)變少了,說明分得更干凈了,svm擬合的結(jié)果更好了
# visualize:
plot(x.train, pch=20, col=c(1,2)[y.train], cex=2)
points(svmo$SV, pch=14, col=4, cex=2) # explain why this does not work!?
# apply scaling to dataset to see SV's:一定要先歸一化,才能描出用到的點(diǎn)(用于找到最好參數(shù)的離margin近的點(diǎn),能左右hyperplane的點(diǎn))
plot(apply(x.train,2,scale), pch=20, col=c(1,2)[y.train], cex=2)
points(svmo$SV, pch=14, col=4, cex=2)
points(svmo.final$SV, pch=5, col=3, cex=2)
# If you want to use predict(), use a formula-type
# expression when calling svm(). Because of this,
# we re-shape our dataset:
#如果你想使用predict(),在調(diào)用svm()時(shí)使用公式類型的表達(dá)式。因此,我們重新塑造我們的數(shù)據(jù)集:
dat.train = data.frame(x=x.train, y=y.train)
dat.test = data.frame(x=x.test)
# decision boundary visualization:
svmo = svm(y~., data=dat.train)
plot(svmo, dat.train,
? ? svSymbol = 15, dataSymbol = 'o',
? ? col=c('cyan','pink')) # this is plot.svm()
svmo.final = svm(y~., data=dat.train, kernel='polynomial',
? ? ? ? ? ? ? ? gamma=svm.tune$best.parameters$gamma,
? ? ? ? ? ? ? ? cost=svm.tune$best.parameters$cost) #最好的擬合
plot(svmo.final, dat.train,
? ? svSymbol = 15, dataSymbol = 'o',
? ? col=c('cyan','pink')) # this is plot.svm()
# How to generate predictions from SVM fit:
# fitting the SVM model:
svmo = svm(y~., data=dat.train,
? ? ? ? ? kernel='polynomial',
? ? ? ? ? gamma=svm.tune$best.parameters$gamma,
? ? ? ? ? cost=svm.tune$best.parameters$cost)
# Note that if we need probabilities P(Y=1)
# (for example to calculate ROC+AUC),
# we need to set 'probability=TRUE' also in
# fitting the SVM model:
svmo = svm(y~., data=dat.train, probability=TRUE,
? ? ? ? ? kernel='polynomial',
? ? ? ? ? gamma=svm.tune$best.parameters$gamma,
? ? ? ? ? cost=svm.tune$best.parameters$cost)
#Generate predictions from SVM fit:
svmp = predict(svmo, newdata=dat.test, probability=TRUE)
roc.svm = roc(response=y.test, predictor=attributes(svmp)$probabilities[,2])
roc.svm$auc#越接近1越好
plot(roc.svm)#very happy
# compare with RF:
rf = randomForest(y~., data=dat.train)
rfp = predict(rf, dat.test, type='prob')
roc.rf = roc(y.test, rfp[,2])
roc.rf$auc
plot(roc.svm)
par(new=TRUE)
plot(roc.rf, col='yellow')
legend("bottomright", bty='n',
? ? ? legend=c("RF","SVM"),
? ? ? lty=1, lwd=3, col=c('yellow',1))
#random forest 比不過svm
--------------------------------------------------------
? # ST4061 / ST6041
? # 2021-2022
? # Eric Wolsztynski
? # ...
? ##### Section 5: demo code for effect of kernel on SVM ####
? # Here we simulate 2D data that have a circular spatial
? # distribution, to see the effect of the choice of kernel
? # shape on decision boundaries
? # --------------------------------------------------------
rm(list=ls())
library(e1071)
# Simulate circular data...
# simulate 2-class circular data:
set.seed(4061)
n = 100
S1 = 15; S2 = 3
x1 = c(rnorm(60, m=0, sd=S1), rnorm(40, m=0, sd=S2))
x2 = c(rnorm(60, m=0, sd=S1), rnorm(40, m=0, sd=S2))
# corresponding 2D circular radii:
rads = sqrt(x1^2+x2^2)
# make up the 2 classes in terms of whether lower or greater than median radius:
c1 = which(rads<median(rads))
c2 = c(1:n)[-c1]
# now we apply scaling factors to further separate the
# 2 classes:
x1[c1] = x1[c1]/1.2
x2[c1] = x2[c1]/1.2
x1[c2] = x1[c2]*1.2
x2[c2] = x2[c2]*1.2
# label data according to class membership:
lab = rep(1,n)
lab[c2] = 2#lab里,c1對(duì)應(yīng)的位置為1;c2對(duì)應(yīng)的位置為2
par(mfrow=c(1,1))
plot(x1,x2,col=c(1,2)[lab], pch=c(15,20)[lab], cex=1.5)
# create final data frame:
x = data.frame(x1,x2)
y = as.factor(lab)
dat = cbind(x,y)
# apply SVMs with different choices of kernel shapes:
svmo.lin = svm(y~., data=dat, kernel='linear')
svmo.pol = svm(y~., data=dat, kernel='polynomial')
svmo.rad = svm(y~., data=dat, kernel='radial')
svmo.sig = svm(y~., data=dat, kernel='sigmoid')
plot(svmo.lin, dat, col=c("cyan","pink"), svSymbol=15)
plot(svmo.pol, dat, col=c("cyan","pink"), svSymbol=15)
plot(svmo.rad, dat, col=c("cyan","pink"), svSymbol=15)
plot(svmo.sig, dat, col=c("cyan","pink"), svSymbol=15)
#### NOTE: the code below is outside the scope of this course! 注意:下面的代碼超出了本課程的范圍!
#### It is used here for illustrations purposes only.
# this call format is easier when using predict():
svmo.lin = svm(x, y, kernel='linear', scale=F)
svmo.pol = svm(x, y, kernel='polynomial', scale=F)
svmo.rad = svm(x, y, kernel='radial', scale=F)
svmo.sig = svm(x, y, kernel='sigmoid', scale=F)
# evaluate the SVM boundaries on a regular 2D grid of points:
ng? ? = 50
xrg? = apply(x, 2, range)
x1g? = seq(xrg[1,1], xrg[2,1], length=ng)
x2g? = seq(xrg[1,2], xrg[2,2], length=ng)
xgrid = expand.grid(x1g, x2g)
plot(x, col=c(1,2)[y], pch=20)
abline(v=x1g, col=8, lty=1)
abline(h=x2g, col=8, lty=1)
#
ygrid.lin = predict(svmo.lin, xgrid)
ygrid.pol = predict(svmo.pol, xgrid)
ygrid.rad = predict(svmo.rad, xgrid)
ygrid.sig = predict(svmo.sig, xgrid)
par(mfrow=c(2,2), font.lab=2, font.axis=2)
CEX = .5
COLS = c(1,3)
DCOLS = c(2,4)
#
plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.lin)], cex=CEX, main="Linear kernel")
points(x, col=DCOLS[as.numeric(y)], pch=20)
# points(x[svmo.lin$index,], pch=21, cex=2)
points(svmo.lin$SV, pch=21, cex=2) # same as previous line!
#
plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.pol)], cex=CEX, main="Polynomial kernel")
points(x, col=DCOLS[as.numeric(y)], pch=20)
points(x[svmo.pol$index,], pch=21, cex=2)
#
plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.rad)], cex=CEX, main="Radial kernel")
points(x, col=DCOLS[as.numeric(y)], pch=20)
points(x[svmo.rad$index,], pch=21, cex=2)
#
plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.sig)], cex=CEX, main="Sigmoid kernel")
points(x, col=DCOLS[as.numeric(y)], pch=20)
points(x[svmo.sig$index,], pch=21, cex=2)
# Alternative plot:
par(mfrow=c(2,2), font.lab=2, font.axis=2)
CEX = .5
COLS = c(1,3)
DCOLS = c(2,4)
#
L1 = length(x1g)
L2 = length(x2g)
#
plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.lin)], cex=CEX, main="Linear kernel")
bnds = attributes(predict(svmo.lin, xgrid, decision.values=TRUE))$decision
contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)
#
plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.pol)], cex=CEX, main="Polynomial kernel")
bnds = attributes(predict(svmo.pol, xgrid, decision.values=TRUE))$decision
contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)
#
plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.rad)], cex=CEX, main="Radial kernel")
bnds = attributes(predict(svmo.rad, xgrid, decision.values=TRUE))$decision
contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)
#
plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.sig)], cex=CEX, main="Sigmoid kernel")
bnds = attributes(predict(svmo.sig, xgrid, decision.values=TRUE))$decision
contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)
# NB: naive Bayes decision boundary is obtained with
# contour(x1g, x2g, matrix(bnds, L1, L2), level=0.5, add=TRUE, col=4, lwd=2)
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Exercises Section 5: SVM ####
# --------------------------------------------------------
rm(list=ls())
library(randomForest)
library(class)
library(pROC)
library(e1071)
library(caret)
library(ISLR)
###############################################################
#### Exercise 1: using SVM to classify (High Carseat sales dataset) ####
###############################################################
library(ISLR) # contains the dataset
# Recode response variable so as to make it a classification problem
High = ifelse(Carseats$Sales<=8, "No", "Yes")
CS = data.frame(Carseats, High)
CS$Sales = NULL
#construct dataset
x = CS
x$High = NULL
y = CS$High
# split the data into train+test (50%-50%):
n = nrow(CS)
set.seed(4061)
i.train = sample(1:n, 350)
x.train = x[i.train,]
x.test = x[-i.train,]
y.train = y[i.train]
y.test = y[-i.train]
class(x.train)
class(y.train)
# ?svm : svm有兩種形式
# svmo = svm(xm, y.train, kernel='polynomial')##svm(x, y = NULL, scale = TRUE, type = NULL, kernel ="radial")
# svmo = svm(y~., data=dat.train)##svm(formula, data = NULL, ..., subset, na.action = na.omit, scale = TRUE)
#由于x必須喂一個(gè)matrix,y必須喂一個(gè)numeric,所以x.train=as.matrix(x.train),y.train=as.numeric(y.train)
# (3) Explain why this does not work:
svmo = svm(x.train, y.train, kernel='polynomial')
# >> The problem is the presence of categorical variables in
# the dataset. They must be "recoded" into numerical variables
# for svm() to analyse their spatial contribution.
# (4) Carry out the appropriate fix from your conclusion from (a).
# Then, fit two SVM models, one using a linear kernel, the other
# a polynomial kernel. Compare the two appropriately.
#將兩個(gè)SVM分類器適合于適當(dāng)?shù)摹鞍茨Α庇?xùn)練集,一個(gè)使用線性核函數(shù),另一個(gè)使用多項(xiàng)式核函數(shù)。
#修正的做法:
NC = ncol(x)
# x = x[,-c(NC-1,NC)] # take out the last two columns (predictors)
xm = model.matrix(y~.+0, data=x)#remove the intercept
xm.train = xm[i.train,]
xm.test = xm[-i.train,]
y.train = as.factor(y.train) # so that svm knows it's classification
svmo.lin = svm(xm.train, y.train, kernel='linear')
svmo.pol = svm(xm.train, y.train, kernel='polynomial')
svmy.lin = fitted(svmo.lin)#fitted:顯示x的對(duì)應(yīng)類別y
svmy.pol = fitted(svmo.pol)
table(y.train, fitted(svmo.lin))
table(y.train, fitted(svmo.pol))
# (5) Comparison...
# * visual (there are better ways of visualising!):
par(mfrow=c(1,3))
yl = as.integer(y=="Yes")+1
plot(apply(xm,2,scale), pch=c(15,20)[yl], col=c(1,4)[yl],
? ? cex=c(1.2,2)[yl], main="The data")
#
plot(apply(xm.train,2,scale), pch=c(15,20)[y.train], col=c(1,4)[y.train],
? ? cex=1, main="linear")
points(svmo.lin$SV, pch=5, col=2, cex=1.2)
#
plot(apply(xm.train,2,scale), pch=c(15,20)[y.train], col=c(1,4)[y.train],
? ? cex=1, main="polynomial")
points(svmo.pol$SV, pch=5, col=2, cex=1.2)
# * in terms of training fit:
svmy.lin = fitted(svmo.lin)
svmy.pol = fitted(svmo.pol)
table(y.train, svmy.lin)
table(y.train, svmy.pol)
# * test error:
pred.lin = predict(svmo.lin, newdata=xm.test, probability=TRUE)
pred.pol = predict(svmo.pol, newdata=xm.test)
# ... the above does not work well:
summary(pred.lin)
# --> these are not probabilities! That's because we need to specify
# ", probability=TRUE" also when fitting the SVM, in order to enable
# probabilities to be computed and returned...
# 這些都不是概率!這是因?yàn)槲覀冊(cè)跀M合SVM時(shí)也需要指定“,probability=TRUE”,以便能夠計(jì)算和返回概率……
# SO IF WE WANT TO GENERATE TEST-SET PREDICTIONS, THIS IS THE WAY:
svmo.lin = svm(xm.train, y.train, kernel='linear', probability=TRUE)
svmo.pol = svm(xm.train, y.train, kernel='polynomial', probability=TRUE)
pred.lin = predict(svmo.lin, newdata=xm.test, probability=TRUE)
pred.pol = predict(svmo.pol, newdata=xm.test, probability=TRUE)
y.test = as.factor(y.test)
confusionMatrix(y.test, pred.lin)
confusionMatrix(y.test, pred.pol)
# * AUC (we need to extract P(Y=1|X))
p.lin = attributes(pred.lin)$probabilities[,2]
p.pol = attributes(pred.pol)$probabilities[,2]
y.test = as.factor(y.test)
roc(response=y.test, predictor=p.lin)$auc
roc(response=y.test, predictor=p.pol)$auc
#sensitivity和specificity都很高
###############################################################
#### Exercise 2: 3-class problem (iris dataset) ####
###############################################################
x = iris
x$Species = NULL
y = iris$Species
set.seed(4061)
n = nrow(x)
i.train = sample(1:n, 100)
x.train = x[i.train,]
x.test = x[-i.train,]
y.train = y[i.train]
y.test = y[-i.train]
# (a)
plot(x.train[,1:2], pch=20, col=c(1,2,4)[as.numeric(y.train)], cex=2)
# (b)
dat = data.frame(x.train, y=as.factor(y.train))
svmo.lin = svm(y~., data=dat, kernel='linear')
svmo.pol = svm(y~., data=dat, kernel='polynomial')
svmo.rad = svm(y~., data=dat, kernel='radial')
#
# number of support vectors:
summary(svmo.lin)
summary(svmo.pol)
summary(svmo.rad)
#The number of support vectors less, the more complicated controversy.
# test error:
pred.lin = predict(svmo.lin, newdata=x.test)
pred.pol = predict(svmo.pol, newdata=x.test)
pred.rad = predict(svmo.rad, newdata=x.test)
cm.lin = confusionMatrix(y.test, pred.lin)
cm.pol = confusionMatrix(y.test, pred.pol)
cm.rad = confusionMatrix(y.test, pred.rad)
c(cm.lin$overall[1], cm.pol$overall[1], cm.rad$overall[1])
#rad more accurcte.
# (c) tune the model(via cross-validation)...
set.seed(4061)
svm.tune = e1071::tune(svm, train.x=x.train, train.y=y.train,
? ? ? ? ? ? ? ? ? ? ? kernel='radial',
? ? ? ? ? ? ? ? ? ? ? ranges=list(cost=10^(-2:2), gamma=c(0.5,1,1.5,2)))
print(svm.tune)
names(svm.tune)
# retrieve optimal hyper-parameters
bp = svm.tune$best.parameters
# use these to obtain final SVM fit:
svmo.rad.tuned = svm(y~., data=dat, kernel='radial',
? ? ? ? ? ? ? ? ? ? cost=bp$cost, gamma=bp$gamma)
summary(svmo.rad)
summary(svmo.rad.tuned)#Not changed much,means it's got more to do with the shape of the kernel itself
#, rather than how the model is tunes for this dataset.
# test set predictions from tuned SVM model:
pred.rad.tuned = predict(svmo.rad.tuned, newdata=x.test)
cm.rad.tuned = confusionMatrix(y.test, pred.rad.tuned)
c(cm.rad$overall[1], cm.rad.tuned$overall[1])
# so maybe not an exact science!?
# ... in fact these performances are comparable, bear in mind CV assessment is
# itself subject to variability...
#These are estimates of prediction performance, there is uncertainty related to the points where the value you have here,
#which means when you see 100% or 96%, you are really looking at the same thing. So you need to look at the confidence interval
#around these value to understand what you are really seeing a difference or not.
###############################################################
#### Exercise 3: SVM using caret ####
###############################################################
# Set up the data (take a subset of the Hitters dataset)
data(Hitters)
Hitters = na.omit(Hitters)
dat = Hitters
n = nrow(dat)
NC = ncol(dat)
# Change into a classification problem:
dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? "High","Low"))
# Data partition
set.seed(4061)
itrain = sample(1:n, size=round(.7*n))
dat.train = dat[itrain,]
dat.validation = dat[-itrain,] # independent validation set
x = dat.train # training set
x$Salary = NULL
y = as.factor(dat.train$Salary)
### Random forest
rf.out = caret::train(Salary~., data=dat.train, method='rf')
rf.pred = predict(rf.out, dat.validation)
rf.cm = confusionMatrix(reference=dat.validation$Salary, data=rf.pred, mode="everything")
### SVM (linear)
svm.out = caret::train(Salary~., data=dat.train, method="svmLinear")
svm.pred = predict(svm.out, dat.validation)
svm.cm = confusionMatrix(reference=dat.validation$Salary, data=svm.pred, mode="everything")
# modelLookup('svmRadial')
### SVM (radial)
svmR.out = caret::train(Salary~., data=dat.train, method="svmRadial")
svmR.pred = predict(svmR.out, dat.validation)
svmR.cm = confusionMatrix(reference=dat.validation$Salary, data=svmR.pred, mode="everything")
perf = rbind(rf.cm$overall, svm.cm$overall, svmR.cm$overall)
row.names(perf) = c("RF","SVM.linear","SVM.radial")
round(perf, 4)
perf = cbind(rf.cm$overall, svm.cm$overall, svmR.cm$overall)
colnames(perf) = c("RF","SVM.linear","SVM.radial")
round(perf, 4)
#評(píng)價(jià)就看accuracy,結(jié)合一開始的圖像,圖像不像是線性的,所以rf應(yīng)該比較好
###############################################################
#### Exercise 4: SVM-based regression ####
###############################################################
x = iris
x$Sepal.Length = NULL
y = iris$Sepal.Length#using Sepal.Length as response variable
pairs(iris[,1:4])
?pairs#pairs生成一個(gè)配對(duì)的散點(diǎn)圖矩陣,矩陣由X中的每列的列變量對(duì)其他各列列變量的散點(diǎn)圖組成
set.seed(4061)
n = nrow(x)
i.train = sample(1:n, 100)
x.train = x[i.train,]
x.test = x[-i.train,]
y.train = y[i.train]
y.test = y[-i.train]
dat.train = cbind(x.train,y=y.train)
# specify statistical training settings:
ctrl = caret::trainControl(method='cv')
# perform statistical training:
svm.o = caret::train(y~., data=dat.train, method="svmLinear",
? ? ? ? ? ? ? ? ? ? trControl=ctrl)#trControl=ctrl
# compute test set predictions:
svm.p = predict(svm.o, newdata=x.test)
# and corresponding MSE:
mean( (y.test-svm.p)^2 )
par(pty='s') #makes or a square plot box
rr = range(c(y.test, svm.p))
plot(y.test, svm.p, pch=20,
? ? xlab="true values", ylab="predicted values",
? ? xlim=rr,ylim=rr)
abline(a=0,b=1)
#Here is a very good enlightenment
#### Section 6 神經(jīng)網(wǎng)絡(luò)####
####分類——?dú)w一化;回歸——標(biāo)準(zhǔn)化####
#learning rate:is applied to sort of calibrate the speed of the learning process data.
#DL deep learning vs ML meachine learning
------------------------------------------------------------
#### Example 1 : iris data with neuralnet ####
#install.packages('neuralnet')
library(neuralnet)
n = nrow(iris)
dat = iris[sample(1:n), ] # shuffle initial dataset
NC = ncol(dat)
nno = neuralnet(Species~., data=dat, hidden=c(6,5))#知道我們要建幾層,每層幾個(gè)節(jié)點(diǎn)時(shí)
#hidden? 第一層6個(gè),第二層5個(gè)的神經(jīng)網(wǎng)絡(luò)
plot(nno,
? ? information=FALSE,#不要標(biāo)數(shù)據(jù)
? ? col.entry='red',
? ? col.out='green',
? ? show.weights=FALSE)
plot(nno,
? ? information=TRUE,#要標(biāo)數(shù)據(jù)
? ? col.entry='red',
? ? col.out='green',
? ? show.weights=TRUE)
#(1)‘blue’ bits are bias
#(2)在黑色線條上的數(shù)字是weights, 可以是positive, 也可以是negative的
#### Example 2: single layer NN - regression - effect of scaling ####
library(nnet)? ? # implements single layer NNs
library(mlbench) # includes dataset BostonHousing
#install.packages('mlbench')
data(BostonHousing) # load the dataset
#View(BostonHousing)
# train neural net
n = nrow(BostonHousing)
itrain = sample(1:n, round(.7*n), replace=FALSE)#要70%的數(shù)據(jù)并取整作為訓(xùn)練樣本
nno = nnet(medv~., data=BostonHousing, subset=itrain, size=5)#size 隱藏層中的單元數(shù)
#只知道幾個(gè)神經(jīng)元,但不知道有幾層
#?nnet
summary(nno$fitted.values)
#We saw the fitted value are all 1, it because the information was not scaled, we should do scale first.
# the above output indicates the algorithm did not
# converge, probably due to explosion of the gradients...
#都是1,說明算法沒有收斂,需要?dú)w一化值(50 是這個(gè)數(shù)據(jù)的最大值,除以最大值):
# We try again, this time normalizing the values
# (50 is the max value for this data, see its range):
nno = nnet(medv/50~., data=BostonHousing, subset=itrain, size=5)
summary(nno$fitted.values)# there was thus a need to normalise the response variable...
# 測(cè)試神經(jīng)網(wǎng)絡(luò)
preds = predict(nno, newdata=BostonHousing[-itrain,]) * 50 # (we multiply by 50 to fall back to original data domain)
#(由于之前除以了50,我們乘以 50 以回退到原始數(shù)據(jù)域)
summary(preds)
# RMSE:偏方誤差
sqrt(mean((preds-BostonHousing$medv[-itrain])^2))
# compare with lm():
lmo = lm(medv~., data=BostonHousing, subset=itrain)
lm.preds = predict(lmo, newdata= BostonHousing[-itrain,])
# RMSE:
sqrt(mean((lm.preds-BostonHousing$medv[-itrain])^2))
#Compare with lm, we have a lower test RMSE.
#平價(jià)的時(shí)候看一下length(itrain)和dim(BostonHousing),看有沒有足夠的訓(xùn)練樣本和測(cè)試樣本
# Further diagnostics may highlight various aspects of the
# model fit - always run these checks!每次都要檢查跟lm的比較
# 進(jìn)一步的診斷可能會(huì)突出模型擬合的各個(gè)方面 - 始終運(yùn)行這些檢查!
par(mfrow=c(2,2))
################################################################################
#PLOT(1): residuals form LM against NN
plot(lmo$residuals, nno$residuals*50)
abline(a=0, b=1, col="limegreen")
#there are some extreme residuals from LM
################################################################################
#PLOT(2): residuals from LM against Original Train Data - no relationship
plot(BostonHousing$medv[itrain], lmo$residuals, pch=20)
################################################################################
#PLOT(3): residuals from LM against Original Train Data (in grey)
#? ? ? ? plus(+) residuals from NN against Original Train Data
plot(BostonHousing$medv[itrain], lmo$residuals, pch=20, col=8)
points(BostonHousing$medv[itrain], nno$residuals*50, pch=20)
################################################################################
#PLOT(4): QQ plot
#In general, Noise is assumed to be normal distributed or Gaussian...
#But in NN, we do not make this assumption. Since between X(input) and Y(output), we hope to use some non-linear function.
#But QQ plot is still a useful tool.
qqnorm(nno$residuals)
abline(a=mean(nno$residuals), b=sd(nno$residuals), col=2)
#### Example 3: effect of tuning parameters (iris data) ####
rm(list=ls())
n = nrow(iris)
# 像往常一樣打亂初始數(shù)據(jù)集(刪除第 4 個(gè)值,減少數(shù)據(jù)量,使數(shù)據(jù)更難準(zhǔn)確)打亂數(shù)據(jù)、重排 #移除Petal.Width數(shù)據(jù)
dat = iris[sample(1:n),-4]
NC = ncol(dat)
#data scaling 不是變成0,1分布,而是將數(shù)據(jù)縮放至為 [0,1]
####scale function: y_normalized = (y-min(y)) / (max(y)-min(y)):####
#dat離第四列character型的數(shù)據(jù)不用歸一化,所以dat[,-NC]
mins = apply(dat[,-NC],2,min)
maxs = apply(dat[,-NC],2,max)
dats = dat
dats[,-NC] = scale(dats[,-NC],center=mins,scale=maxs-mins)
# 設(shè)置訓(xùn)練樣本:
itrain = sample(1:n, round(.7*n), replace=FALSE)
nno = nnet(Species~., data=dats, subset=itrain, size=5)
# 預(yù)測(cè):
nnop = predict(nno, dats[-itrain,])
head(nnop)#返回向量、矩陣、表格、數(shù)據(jù)框或函數(shù)的第一部分? ?
#這是從概率中獲取預(yù)測(cè)標(biāo)簽的一種方法:
preds = max.col(nnop) #找到矩陣每一行的最大位置在第幾列,用來做混淆矩陣
#(上面的行為每一行選擇概率最高的列,即每個(gè)觀察值)或者我們可以直接使用它:
preds = predict(nno, dats[-itrain,], type='class')
tbp = table(preds, dats$Species[-itrain])#混淆矩陣
sum(diag(tbp))/sum(tbp) #準(zhǔn)確率
# #nnet里size怎樣影響正確率
####找到最合適的size####
# 在這里,我們嘗試使用 1 到 10 的尺寸進(jìn)行說明,但您可以隨意使用這些值!
sizes = c(1:10)
rate = numeric(length(sizes)) # 訓(xùn)練集分類準(zhǔn)確率
ratep = numeric(length(sizes)) # 測(cè)試集分類準(zhǔn)確率
for(d in 1:length(sizes)){
? nno = nnet(Species~., data=dats, subset=itrain,
? ? ? ? ? ? size=sizes[d])
? tb = table(max.col(nno$fitted.values), dats$Species[itrain])
? rate[d] = sum(diag(tb))/sum(tb)
? # now looking at test set predictions
? nnop = predict(nno, dats[-itrain,])
? tbp = table(max.col(nnop), dats$Species[-itrain])
? ratep[d] = sum(diag(tbp))/sum(tbp)
}
plot(rate, pch=20, t='b', xlab="layer size", ylim=range(c(rate,ratep)))
points(ratep, pch=15, t='b', col=2)
legend('bottomright', legend=c('training','testing'),
? ? ? pch=c(20,15), col=c(1,2), bty='n')
# 注意訓(xùn)練集和測(cè)試集的表現(xiàn)不一定相似......
#由此找到最好最合適的size
#nnet里decay(權(quán)重衰減參數(shù),默認(rèn)為 0) 怎樣影響正確率
decays = seq(1,.0001,lengt=11)
rate = numeric(length(decays)) # train-set classification rate
ratep = numeric(length(decays)) # test-set classification rate
for(d in 1:length(decays)){
? # fit NN with that particular decay value (decays[d]):
? nno = nnet(Species~., data=dats, subset=itrain, size=10,
? ? ? ? ? ? decay=decays[d])
? # corresponding train set confusion matrix:
? tb = table(max.col(nno$fitted.values), dats$Species[itrain])
? rate[d] = sum(diag(tb))/sum(tb)
? # now looking at test set predictions:
? nnop = predict(nno, dats[-itrain,])
? tbp = table(max.col(nnop), dats$Species[-itrain])
? ratep[d] = sum(diag(tbp))/sum(tbp)
}
plot(decays, rate, pch=20, t='b', ylim=range(c(rate,ratep)))
points(decays, ratep, pch=15, t='b', col=2)
legend('topright', legend=c('training','testing'),
? ? ? pch=c(20,15), col=c(1,2), bty='n')
rm(list=ls())
######Exercise ######
#### Exercise 1####
# 1. What type of neural network does this code implement? FFNN
#有兩種函數(shù)能實(shí)現(xiàn)神經(jīng)網(wǎng)絡(luò),一種是step function——大于某一個(gè)值是1,小于是0,非黑即白;一種是activation function——有確切的數(shù)字輸出
# 2.
library(MASS)
library(neuralnet)
# --- NN with one 10-node hidden layer
nms = names(Boston)[-14]
f = as.formula(paste("medv ~", paste(nms, collapse = " + ")))
# fit a single-layer, 10-neuron NN:
set.seed(4061)
out.nn = neuralnet(f, data=Boston, hidden=c(10), rep=5,
? ? ? ? ? ? ? ? ? linear.output=FALSE)
#plot(out.nn, information=TRUE, col.entry='red', col.out='green',show.weights=TRUE)
#create single-hidden layer neural network and repeat 5 times
# without using an activation function:
set.seed(4061)
out.nn.lin = neuralnet(f, data=Boston, hidden=c(10), rep=1,
? ? ? ? ? ? ? ? ? ? ? linear.output=TRUE)
# Warning message: 算法在 stepmax 內(nèi)的 1 次重復(fù)中沒有收斂,所以需要運(yùn)行此代碼兩遍
# Algorithm did not converge in 1 of 1 repetition(s) within the stepmax.
#線性輸出:
set.seed(4061)
out.nn.tanh = neuralnet(f, data=Boston, hidden=c(10), rep=5,
? ? ? ? ? ? ? ? ? ? ? ? linear.output=FALSE, act.fct='tanh')
p1 = predict(out.nn, newdata=Boston)
p2 = predict(out.nn.tanh, newdata=Boston)
sqrt(mean((p1-Boston$medv)^2))
sqrt(mean((p2-Boston$medv)^2))
#參數(shù):
#linear.output: logical,線性輸出為TRUE,nonlinear為FALSE
#rep: 神經(jīng)網(wǎng)絡(luò)訓(xùn)練的重復(fù)次數(shù)
#act.fct: 一個(gè)可微函數(shù),用于平滑協(xié)變量或神經(jīng)元與權(quán)重的叉積的結(jié)果
#act.fct: 默認(rèn)“l(fā)ogistic”,也可以是“tanh”
#### Exercise 2 ####
library(neuralnet)
set.seed(4061)
n = nrow(iris)
dat = iris[sample(1:n), ] # shuffle initial dataset
NC = ncol(dat)
nno = neuralnet(Species~., data=dat, hidden=c(6,5))
plot(nno)
#### Exercise 3 #####
#Load dataset MASS::Boston and perform a 70%-30% split for training and test sets
#respectively. Use set.seed(4061) when splitting the data and also every time you run a
#neural network.
#1. Compare single-layer neural network fits from the neuralnet and nnet libraries.
#Can you explain any difference you may find?
# 2. Change the "threshold" argument value to 0.001 in the call to neuralnet, and
#comment on your findings (this run might take a bit more time to converge)
#加載數(shù)據(jù)集 MASS::Boston 并分別對(duì)訓(xùn)練集和測(cè)試集執(zhí)行 70%-30% 的拆分。
#1. 比較來自神經(jīng)網(wǎng)絡(luò)和 nnet 庫的單層神經(jīng)網(wǎng)絡(luò)擬合。
#解釋可能發(fā)現(xiàn)的任何區(qū)別嗎?
#2. 在對(duì)神經(jīng)網(wǎng)絡(luò)的調(diào)用中將“閾值”參數(shù)值更改為 0.001,
#并評(píng)論發(fā)現(xiàn)(此運(yùn)行可能需要更多時(shí)間才能收斂)
#當(dāng)外界刺激達(dá)到一定的閥值時(shí),神經(jīng)元才會(huì)受刺激,影響下一個(gè)神經(jīng)元。
#超過閾值,就會(huì)引起某一變化,不超過閾值,無論是多少,都不產(chǎn)生影響
rm(list=ls())
library(neuralnet)
library(nnet)? ? # implements single layer NNs
library(MASS) # includes dataset BostonHousing
data(Boston) # load the dataset
# train neural nets
n = nrow(Boston)
itrain = sample(1:n, round(.7*n), replace=FALSE)
dat = Boston
dat$medv = dat$medv/50 #歸一化
dat.train = dat[itrain,]
dat.test = dat[-itrain,-14]#自變量的測(cè)試樣本
y.test = dat[-itrain,14]#因變量的測(cè)試樣本
#nnet 單層五個(gè)神經(jīng)元
nno1 = nnet(medv~., data=dat.train, size=5, linout=TRUE)
fit1 = nno1$fitted.values
mean((fit1-dat.train$medv)^2) #偏差
#neuralnet 單層五個(gè)神經(jīng)元
nno2 = neuralnet(medv~., data=dat.train, hidden=c(5), linear.output=TRUE)
fit2 = predict(nno2, newdata=dat.train)[,1]
mean((fit2-dat.train$medv)^2) #偏差
##閾值threshold0.0001的neuralnet
nms = names(dat)[-14]
f = as.formula(paste("medv ~", paste(nms, collapse = " + ")))#下面所用的函數(shù)太長,所以先寫出來
nno3 = neuralnet(f, data=dat.train, hidden=5, threshold=0.0001)#threshold 閾值####f跟medv有啥區(qū)別??####
#Threshold in 'neuralnet' is originally 0.01. Now we set it to be 0.0001.
fit3 = predict(nno3, newdata=dat.train)[,1]
mean((fit3-dat.train$medv)^2)#0.005276877 #even much better!
#用mean來看模型能不能用,mean不能太大
# test neural nets predict一定要乘回去50
y.test = y.test*50
p1 = predict(nno1, newdata=dat.test)*50
p2 = predict(nno2, newdata=dat.test)*50
p3 = predict(nno3, newdata=dat.test)*50
mean((p1-y.test)^2)
mean((p2-y.test)^2)
mean((p3-y.test)^2)
#test的mean用來看哪個(gè)模型更好
# explain these differences?
names(nno1)#nnet
names(nno2)#neuralnet
# nnet:
# - activation function: logistic
# - algorithm: BFGS in optim
# - decay: 0
# - learning rate: NA
# - maxit: 100
# neuralnet:
# - activation function: logistic
# - algorithm: (some form of) backpropagation
# - decay: ?
# - learning rate: depending on algorithm
# - maxit:?
# so what is it?
#nnet里的activation function:
#不是所有的信號(hào)都要做反應(yīng),需要activation function去看需要對(duì)哪些信號(hào)作出反應(yīng)
#hide層和output層都有activation function
#hide層的activation function是由act.fun決定的——tanch正切或者logistic
#output層的activation function是由linout決定的
#做regression時(shí)一般linout=T,表明output層的activation function是identical的,就是輸入是啥輸出就是啥,不用做改變
#linout=F output的activation function是logistic,輸出值要變成邏輯變量
#默認(rèn)值hide和output都是logistic
####? Exercise 4 ####
#Fit a single-layer feed-forward neural network using nnet to
#Report on fitted values.
#使用 nnet 擬合單層前饋神經(jīng)網(wǎng)絡(luò)
#報(bào)告擬合值。
rm(list=ls())
library(caret)
library(neuralnet)
library(nnet)
library(ISLR)
#set up the data (take a subset of the Hitters dataset)設(shè)置數(shù)據(jù)(獲取 Hitters 數(shù)據(jù)集的子集)
dat = na.omit(Hitters) #返回刪除NA后的向量a? 因?yàn)樵摂?shù)據(jù)里有缺失值
#is.na(Hitters)
n = nrow(dat)
NC = ncol(dat)
# Then try again after normalizing the response variable to [0,1]:將響應(yīng)變量歸一化為 [0,1]
dats = dat
dats$Salary = (dat$Salary-min(dat$Salary)) / diff(range(dat$Salary))
# train neural net
itrain = sample(1:n, round(.7*n), replace=FALSE)
dat.train = dat[itrain,]
dats.train = dats[itrain,]
dat.test = dat[-itrain,]
dats.test = dats[-itrain,]
#data Salary which is not scaled: do not work
#歸一化前 dat是歸一化前,dats是歸一化后
nno = nnet(Salary~., data=dat.train, size=10, decay=c(0.1))
summary(nno$fitted.values)
#data Salary is scaled, but no regularization: do not work either
#歸一化后
nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0))
summary(nno.s$fitted.values)
#data Salary is scaled, and also have regularization progress: works!
#歸一化后
nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0.1))
summary(nno.s$fitted.values)
#Our last attempt above was a success.
#But we should be able to get a proper fit even for decay=0...
#what's going on? Can you get it to work?
#改進(jìn)!添加系數(shù)linout=1
#(A1) Well, it's one of these small details in how you call a function;
#here we have to specify 'linout=1' because we are considering a regression problem
#for regression problem: class k = 1
#data Salary which is not scaled:
set.seed(4061)
nno = nnet(Salary~., data=dat.train, size=10, decay=c(0.1), linout=1)
summary(nno$fitted.values)
#data Salary which is scaled, but with no decay:
set.seed(4061)
nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0), linout=1)
summary(nno.s$fitted.values)
#data Salary which is scaled, and also with decay:
set.seed(4061)
nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0.1), linout=1)
summary(nno.s$fitted.values)
#改進(jìn)!寫function
# (A2) but let's do the whole thing again more cleanly...
# 重新編碼和放縮數(shù)據(jù)對(duì)結(jié)果影響的比較
# re-encode and scale dataset properly? 正確重新編碼和縮放數(shù)據(jù)集
myrecode <- function(x){
? # function recoding levels into numerical values
? #函數(shù)將級(jí)別重新編碼為數(shù)值
? if(is.factor(x)){
? ? levels(x)
? ? return(as.numeric(x))
? } else {
? ? return(x)
? }
}
myscale <- function(x){
? # function applying normalization to [0,1] scale
? #對(duì) [0,1] 尺度應(yīng)用歸一化的函數(shù)
? minx = min(x,na.rm=TRUE)
? maxx = max(x,na.rm=TRUE)
? return((x-minx)/(maxx-minx))
}
datss = data.frame(lapply(dat,myrecode))
datss = data.frame(lapply(datss,myscale))
# replicate same train-test split:
#復(fù)制相同的訓(xùn)練測(cè)試拆分:
datss.train = datss[itrain,]
datss.test = datss[-itrain,]
nno.ss.check = nnet(Salary~., data=datss.train, size=10, decay=0, linout=1)
summary(nno.ss.check$fitted.values)
# use same scaled data but with decay as before:
#使用相同的縮放數(shù)據(jù),但與以前一樣衰減:
nno.ss = nnet(Salary~., data=datss.train, size=10, decay=c(0.1), linout=1)
summary(nno.ss$fitted.values)
# evaluate on test data (with same decay for both models):
#評(píng)估測(cè)試數(shù)據(jù)(兩個(gè)模型的衰減相同):
datss.test$Salary - dats.test$Salary
pred.s = predict(nno.s, newdata=dats.test)
pred.ss = predict(nno.ss, newdata=datss.test)
mean((dats.test$Salary-pred.s)^2)
mean((datss.test$Salary-pred.ss)^2)
#Feed-forward neural network(FFNN)
#? Single or multiplelayers 單層或多層
#? Forward propagationonly 僅前向傳播
#? Number of layers determines function complexity 層數(shù)決定功能復(fù)雜度
#? Typically uses a nonlinear activation function 通常使用非線性激活函數(shù)
#? Some definitions specify a unique hidden layer, others allow any number of layers一些定義指定一個(gè)唯一的隱藏層,其他定義允許任意數(shù)量的層
#Multilayer Perceptron(MLP)
#Recurrent neural network(RNN)
#Long short-term memory neural network(LSTMNN)
#Convolutional neural network(CNN)
#### Exercise 6: Olden index #####
#使用 NeuralNetTools::olden() 計(jì)算以下數(shù)據(jù)集的變量重要性,
#每次擬合一個(gè) 7 神經(jīng)元單層 FFNN (nnet):
#1. 鳶尾花數(shù)據(jù)集(使用全套);
#2. 波士頓數(shù)據(jù)集
#。 與從隨機(jī)森林獲得的變量重要性評(píng)估進(jìn)行比較。
rm(list=ls())
library(nnet)
library(NeuralNetTools)
library(randomForest)
library(MASS)
myscale <- function(x){
? minx = min(x,na.rm=TRUE)
? maxx = max(x,na.rm=TRUE)
? return((x-minx)/(maxx-minx))
}
# (1) Iris data
# shuffle dataset...
set.seed(4061)
n = nrow(iris)
dat = iris[sample(1:n),]
# rescale predictors...
dat[,1:4] = myscale(dat[,1:4])
# fit Feed-Forward Neural Network...
set.seed(4061)
nno = nnet(Species~., data=dat, size=c(7), linout=FALSE, entropy=TRUE)
pis = nno$fitted.values
matplot(pis, col=c(1,2,4), pch=20)
y.hat = apply(pis, 1, which.max) # fitted values
table(y.hat, dat$Species)
# compute variable importance...
#神經(jīng)網(wǎng)絡(luò)中輸入變量的相對(duì)重要性作為原始輸入隱藏、隱藏輸出連接權(quán)重的乘積之和
vimp.setosa = olden(nno, out_var='setosa', bar_plot=FALSE)
vimp.virginica = olden(nno, out_var='virginica', bar_plot=FALSE)
vimp.versicolor = olden(nno, out_var='versicolor', bar_plot=FALSE)
names(vimp.setosa)
par(mfrow=c(1,2))
plot(iris[,3:4], pch=20, col=c(1,2,4)[iris$Species], cex=2)
plot(iris[,c(1,3)], pch=20, col=c(1,2,4)[iris$Species], cex=2)
dev.new()
plot(olden(nno, out_var='setosa'))
plot(olden(nno, out_var='virginica'))
plot(olden(nno, out_var='versicolor'))
v.imp = cbind(vimp.setosa$importance, vimp.virginica$importance, vimp.versicolor$importance)
rownames(v.imp) = names(dat)[1:4]
colnames(v.imp) = levels(dat$Species)
(v.imp)
#正負(fù)值代表positive 還是nagative effect
#絕對(duì)值越大,自變量對(duì)因變量的影響越大
# fit RF...
set.seed(4061)
rfo = randomForest(Species~., data=dat, ntrees=1000)
rfo$importance
# how can we compare variable importance assessments?
cbind(apply(v.imp, 1, sum), #所有自變量放在一起對(duì)y的影響重要性(有抵消)
? ? ? apply(abs(v.imp), 1, sum), #取絕對(duì)值
? ? ? rfo$importance)
#三個(gè)值都大的自變量have overall contribution
# (2) Boston data
set.seed(4061)
n = nrow(Boston)
dat = Boston[sample(1:n),]
# rescale predictors...
dats = myscale(dat)
dats$medv = dat$medv/50
set.seed(4061)
nno = nnet(medv~., data=dats, size=7, linout=1)
y.hat = nno$fitted.values
plot(y.hat*50, dat$medv)#從圖中看準(zhǔn)確度
mean((y.hat*50-dat$medv)^2)#偏差
v.imp = olden(nno, bar_plot=FALSE)
plot(v.imp)
# fit RF...里面的重要性,跟神經(jīng)網(wǎng)絡(luò)擬合得到的重要性進(jìn)行比較
set.seed(4061)
rfo = randomForest(medv~., data=dat, ntrees=1000)
rfo$importance
# how can we compare variable importance assessments?我們?nèi)绾伪容^變量重要性評(píng)估?
cbind(v.imp, rfo$importance)
round(cbind(v.imp/sum(abs(v.imp)),
? ? ? ? ? ? rfo$importance/sum(rfo$importance)),3)*100 #重要性的百分比
#一些變量兩邊數(shù)都大,突出standout
#一些變量差距兩邊很大
# should we use absolute values of Olden's index?應(yīng)該使用奧爾登指數(shù)的絕對(duì)值
par(mfrow=c(2,1))
barplot(abs(v.imp[,1]), main="importance from NN",
? ? ? ? names=rownames(v.imp), las=2)
barplot(rfo$importance[,1], main="importance from RF",
? ? ? ? names=rownames(v.imp), las=2)
# 作圖比較
# or possibly normalize across all values for proportional contribution?
par(mfrow=c(2,1))
NNN = sum(abs(v.imp[,1]))
NRF = sum(abs(rfo$importance[,1]))
barplot(abs(v.imp[,1])/NNN, main="importance from NN",
? ? ? ? names=rownames(v.imp), las=2)
barplot(rfo$importance[,1]/NRF, main="importance from RF",
? ? ? ? names=rownames(v.imp), las=2)
# 把上面兩個(gè)圖合在一起looks alright... now make it a nicer comparative plot :)
par(font=2, font.axis=2)
imps = rbind(NN=abs(v.imp[,1])/NNN, RF=rfo$importance[,1]/NRF)
cols = c('cyan','pink')
barplot(imps, names=colnames(imps), las=2, beside=TRUE,
? ? ? ? col=cols,
? ? ? ? ylab="relative importance (%)",
? ? ? ? main="Variable importance from NN and RF")
legend("topleft", legend=c('NN','RF'), col=cols, bty='n', pch=15)
####Section 7 Feature Selection ####
library(ISLR)
library(leaps) # contains regsubsets()
#查看哪些變量重要,那些變量不需要
####Exercise 1: best subset selection ####
rm(list=ls())
Hitters = na.omit(Hitters)
dim(Hitters)
# (1) 執(zhí)行最佳子集選擇
reg.full = regsubsets(Salary~., data=Hitters, method="exhaustive")
# method=c("exhaustive","backward", "forward", "seqrep"),
# method="exhaustive"窮舉 是最優(yōu)的
#(2)解釋標(biāo)準(zhǔn) regsubsets() 輸出。
names(summary(reg.full))
summary(reg.full) #有*表示要這個(gè)變量;沒*就不要這個(gè)變量
summary(reg.full)$which? ? ? # 追蹤窮舉過程
summary(reg.full)$outmat? ? # 追蹤窮舉過程
#舉例最后的結(jié)果是:
#y~x1+x5
#y~x2+x3+x4
# (3)將 RSS 繪制為變量數(shù)量的函數(shù)
RSS = summary(reg.full)$rss
plot(RSS, pch=20, t='b', xlab="Number of covariates", ylab='RSS')#殘差平方和
# 隨著Number of covariates的增大,RSS依舊不能趨于穩(wěn)定,那么我們要換adjusted R square調(diào)整后的 r 平方作為判斷標(biāo)準(zhǔn)
R2adj = summary(reg.full)$adjr2 #調(diào)整后的 r 平方
plot(R2adj, pch=20, t='b', xlab="Number of covariates",
? ? ylab='Adjusted R^2')
#r 相關(guān)系數(shù)
#增加自變量的個(gè)數(shù)時(shí),判定系數(shù)就會(huì)增加,
#即隨著自變量的增多,R平方會(huì)越來越大,
#會(huì)顯得回歸模型精度很高,有較好的擬合效果。而實(shí)際上可能并非如此,
#有些自變量與因變量(即預(yù)測(cè))完全不相關(guān),
#增加這些自變量,并不會(huì)提升擬合水平和預(yù)測(cè)精度。為避免這種現(xiàn)象
#因此需要調(diào)整后的R平方
# (4)將最終模型的所需大小增加到 19,并找出哪個(gè)子模型有:
#最小的 RSS
#最高調(diào)整后的 R2
reg.full = regsubsets(Salary~., data=Hitters, nvmax=19)
par(mfrow=c(1,2))
RSS = summary(reg.full)$rss
plot(RSS, pch=20, t='b', xlab="Number of covariates", ylab='RSS')
#到10的時(shí)候才開始趨于穩(wěn)定,RSS不好
R2 = summary(reg.full)$rsq? ? ? #調(diào)整前的 r 平方
R2adj = summary(reg.full)$adjr2? #調(diào)整后的 r 平方
plot(R2, pch=20, t='b', xlab="Number of covariates",
? ? ylab='Original and adjusted R^2')
points(R2adj, col=4, pch=15, t='b')
# 找到最優(yōu)值,也就是最好有幾個(gè)變量
R2adj.index = which.max(R2adj)
# 把這個(gè)值畫在圖里
abline(v = R2adj.index)
#11個(gè)變量
#提取找到對(duì)應(yīng)的模型:
summary(reg.full)$outmat[R2adj.index,]
mod = summary(reg.full)$which[R2adj.index,]#提取需要的
names(which(mod))
# 畫對(duì)應(yīng)的heatmap:
plot(reg.full, scale="adjr2")
summary(reg.full)$outmat
# 與上面的追蹤相對(duì)應(yīng)
#0.5為調(diào)整后的R平方的臨界值,
#如果調(diào)整后的R平方小于0.5,則要分析我們所采用和未采用的自變量。
#如果調(diào)整后的R平方與R平方存在明顯差異,
#則意味著所用的自變量不能很好的測(cè)算因變量的變化,
#或者是遺漏了一些可用的自變量
#調(diào)整后的R平方與R平方間差距越大,模型的擬合越差。
#### Exercise 2: fwd/bwd subset selection ####
#1. 使用 jumps::regsubsets() 執(zhí)行向前和向后選擇。
#2. 比較 RSS 和 Adjusted R2 w.r.t 的圖。 模型尺寸。
#3. 比較通過每種方法選擇的 4 變量模型中的協(xié)變量。
#4. 以任何其他方式探索最終的模型選擇
rm(list=ls())
Hitters = na.omit(Hitters)
dim(Hitters)
#(1)使用兩種方法
reg.fwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="forward")
reg.bwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="backward")
#(2)比較
par(mfrow=c(1,2))
#畫rss圖
plot(summary(reg.bwd)$rss, t='b', pch=20, cex=1.5)
points(summary(reg.fwd)$rss, t='b', pch=15, col=2)
#畫adjr2圖
plot(summary(reg.bwd)$adjr2, t='b', pch=20, cex=1.5)
points(summary(reg.fwd)$adjr2, t='b', pch=15, col=2)
#最優(yōu)值
R2adj.fwd.index = which.max(summary(reg.fwd)$adjr2)
R2adj.bwd.index = which.max(summary(reg.bwd)$adjr2)
abline(v=c(R2adj.bwd.index, R2adj.fwd.index), col=c(1,2))
#(3)找每個(gè)模型中有4個(gè)變量的模型
# 4-variable models:
#id:變量個(gè)數(shù)
coef(reg.fwd, id=4)
coef(reg.bwd, id=4)
# 但最好的還是"exhaustive"
reg.full = regsubsets(Salary~., data=Hitters, method="exhaustive")
names(which(summary(reg.full)$which[4,]==TRUE))
# 從反向消除過程中提取最優(yōu)模型:
coef(reg.bwd, id=R2adj.bwd.index)
#### Exercise 3: generate predictions from regsubsets() output?####
#但是在Exercise 5里,加了兩行代碼就能用了
rm(list=ls())
dat = na.omit(Hitters)
n = nrow(dat)
itrain = sample(1:n, 150)
reg.fwd = regsubsets(Salary~., data=dat, nvmax=10,
? ? ? ? ? ? ? ? ? ? method="forward", subset=itrain)
#predict(reg.fwd, newdata=dat[-itrain,])
#predict不能用,得自己寫predict
beta.hat = coef(reg.fwd, id=4)? ?
# create matrix X:
test.dat = model.matrix(Salary~., data = dat[-itrain,])
# get the test data matrix:
Xtest = test.dat[,names(beta.hat)]
# compute (X Beta^T) as in Y = (X Beta^t) + Epsilon:
pred = Xtest %*% beta.hat?
pred = as.numeric(pred)? ? # make this a vector instead
# compute prediction RMSE:
sqrt( mean((pred - dat$Salary[-itrain])^2) )
####Exercise 4: fit and predict using stats::step() ####
#使用 R 的基本包 stats 中的函數(shù) step() 對(duì)完整的 Hitters 數(shù)據(jù)集執(zhí)行逐步選擇(一旦刪除了缺失值的觀察,如練習(xí) 2 中所示)。
#1. 比較 step() 和 regsubsets() 的后向選擇輸出。
#2. 使用 step() 引用通過反向選擇選擇的最終模型的模型擬合摘要。
#step : 在逐步算法中通過 AIC 選擇模型
rm(list=ls())
Hitters = na.omit(Hitters)
dim(Hitters)
# 從 regsubsets() 中逐步選擇:
reg.fwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="forward")
reg.bwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="backward")
# 來自 step() 的逐步選擇:
lm.out = lm(Salary~., data=Hitters) # full model
step.bth = step(lm.out, direction="both")#默認(rèn)方法
step.fwd = step(lm.out, direction="forward")
step.bck = step(lm.out, direction="backward")
# compare backward selections from regsubsets() and step():
#
# ... from step()...
coef(step.bck)
length(coef(step.bck))
summary(step.bck)
# Nice: we get the model directly! No need to reconstruct
# fitted values by hand this time!
#
# ... from regsubsets()...
i.opt = which.min(summary(reg.bwd)$bic)
i.set.opt = summary(reg.bwd)$which[i.opt,]
summary(reg.bwd)$which[i.opt, i.set.opt]
# Different models, but smaller one is included in larger one.
# Difference is likely due to using different criteria
# (AIC v BIC, BIC yielding a smaller model).
# NB: we can also assess feature contributions in terms
# of magnitude of their effect:
coefs = abs(coef(step.fwd))/sum(abs(coef(step.fwd)), na.rm=TRUE)*100
coefs = coefs[-1]
coefs[order(abs(coefs), decreasing=TRUE)]
#從大到小每個(gè)變量占重要性(1)的比率,后面應(yīng)該加 %
####Exercise 5: generate predictions from step() output? ####
rm(list=ls())
dat = na.omit(Hitters)
n = nrow(dat)
itrain = sample(1:n, 150)
lmo = lm(Salary~., data=dat, subset=itrain)#這是新添加的
reg.fwd = step(lmo, direction="forward")#這是新添加的
pred = predict(reg.fwd, newdata=dat[-itrain,])
sqrt(mean((pred - dat$Salary[-itrain])^2))
#### 帶插入符號(hào)的逐步(邏輯)回歸 #####
rm(list=ls())
dat = na.omit(Hitters)
n = nrow(dat)
dat = dat[sample(1:n,n),]
dat$y = as.factor(dat$Salary>mean(dat$Salary))#是否大于均值
dat$Salary = NULL
levels(dat$y) = c("low","hi")
# 分層拆分為訓(xùn)練+測(cè)試數(shù)據(jù)集
itrain = createDataPartition(dat$y, p=.75, times=1)[[1]]
dtrain = dat[itrain,]
dtest = dat[-itrain,]
trC = trainControl(method="cv", number=5,
? ? ? ? ? ? ? ? ? savePredictions = TRUE,
? ? ? ? ? ? ? ? ? classProbs = TRUE)
co = train(y~., data=dtrain, method='glmStepAIC',
? ? ? ? ? trControl=trC, distribution='binomial')
summary(co$finalModel)
names(co)
preds <- predict(co, dtest)
probs <- predict(co, dtest, type="prob")
table(dtest$y,preds)
pd = data.frame(obs=dtest$y,pred=preds,low=probs$low)
twoClassSummary(pd, lev=levels(dtest$y))
#### on simulated data ####
rm(list=ls())
library(glmnet)
library(leaps)
library(caret)
# We start by creating a dummy data set, so that we know which
# features actually make up the observations. Here a linear
# combination of nonlinear expression of 3 features are used
# to create observations Y.
# The 3 features are:
# - education level
# - number of years of experience
# - some employee rating assessed by their company
# The response variable Y is salary.
# Another 20 features containing random noise are also created
# and added to the dataset. They should not be selected by our
# model selection method...
n = 500 # desired sample size
# level of education (1=secondary level, 2=BSc, 3=MSc, 4=PhD/MBA):
edu = sample(c(1:4), size=n, prob=c(.05,.5,.35,.1), replace=TRUE)
# nbr years experience:
yex = rpois(n=n, lambda=6)
# some obscure employee rating made up by the company:
ert = pmin(5, pmax(0, 5-rexp(n=n, rate=2)))
# employee salary (response variable):
sal = 2*exp(.15*yex) + 3.2*log(edu) + 4*ert
par(mfrow=c(2,2))
plot(factor(edu), main="education")
hist(yex, main="years experience")
hist(ert, main="employee rating")
hist(sal, main="Salaries")
par(mfrow=c(1,3), pch=20)
boxplot(sal~factor(edu), main="salary wrt\n education")
plot(yex, sal, main="salary v\n years experience")
plot(ert, sal, main="salary v\n employee rating")
# now make up some dummy features...
# we don't bother changes scales/means since we will
# be normalizing these features...
p = 20
xtra = matrix(rnorm(n*p), ncol=p, nrow=n)
colnames(xtra) = paste("X",c(1:p),sep="")
par(mfrow=c(4,5), pch=20, mar=c(1,1,1,1))
for(j in 1:p){
? plot(xtra[,j], sal, main=paste(j))
}
# the data frame(s):
features = data.frame(edu,yex,ert,xtra)
dat = data.frame(sal,features) # may be more convenient sometimes
# train-test split:
i.train = sample(1:n, size=300, replace=FALSE)
# line up data in several formats for convenience:
dat.train = dat[i.train,]
dat.test = dat[-i.train,]
x.train = features[i.train,]
y.train = sal[i.train]
x.test = features[-i.train,]
y.test = sal[-i.train]
# not forgetting matrix forms for the likes of glmnet:
xm = model.matrix(sal~.,data=features)[,-1]
xm.train = xm[i.train,]
xm.test = xm[-i.train,]
#### check out what LASSO would tell us ####
lasso.cv = cv.glmnet(xm.train, y.train)
lasso = glmnet(xm.train, y.train, lambda=lasso.cv$lambda.min)
coef(lasso)
# c.lasso = caret::train(x.train, y.train, method="glmnet")
# how about cross-validating this?
K = 10
folds = cut(1:n, breaks=K, labels=FALSE)
sel = matrix(0, nrow=K, ncol=ncol(features))
colnames(sel) = names(features)
for(k in 1:K){
? itr = which(folds!=k)
? lasso.cv = cv.glmnet(xm[itr,], sal[itr])
? lasso = glmnet(xm[itr,], sal[itr], lambda=lasso.cv$lambda.min)
? isel = which(coef(lasso)[-1] != 0)
? sel[k,isel] = 1
}
apply(sel,2,mean)*100
# LASSO thinks X1 and X14, for example, are important...
# We'd be better off increasing the regularization parameter,
# e.g. using lasso.cv$lambda.min*2 instead (try it!).
#### perform FS with caret::rfe based on linear regression ####
subsets <- c(1:5, 10, 15, 20, ncol(features))
ctrl <- rfeControl(functions = lmFuncs,
? ? ? ? ? ? ? ? ? method = "cv",
? ? ? ? ? ? ? ? ? number = 10,
? ? ? ? ? ? ? ? ? # method = "repeatedcv",
? ? ? ? ? ? ? ? ? # repeats = 5,
? ? ? ? ? ? ? ? ? verbose = FALSE)
lm.rfe <- rfe(x.train, y.train,
? ? ? ? ? ? ? sizes = subsets,
? ? ? ? ? ? ? rfeControl = ctrl)
lm.rfe
# This function has picked the correct subset of features
#### compare with leaps...####
reg.bwd = regsubsets(sal~., data=dat.train,
? ? ? ? ? ? ? ? ? ? nvmax=ncol(features))
opt.mod = which.max(summary(reg.bwd)$adjr2)
isel = which(summary(reg.bwd)$which[opt.mod,-1])
isel
# how about cross-validating this?
K = 10
folds = cut(1:n, breaks=K, labels=FALSE)
sel = matrix(0, nrow=K, ncol=ncol(features))
colnames(sel) = names(features)
for(k in 1:K){
? itr = which(folds!=k)
? reg.bwd = regsubsets(sal~., data=dat[itr,],
? ? ? ? ? ? ? ? ? ? ? nvmax=ncol(features))
? opt.mod = which.max(summary(reg.bwd)$adjr2)
? isel = which(summary(reg.bwd)$which[opt.mod,-1])
? sel[k,isel] = 1
}
apply(sel,2,mean)*100
# X1 and X14, again...
#### perform FS with caret::rfe based on RF ####
subsets <- c(1:5, 10, 15, 20, ncol(features))
ctrl <- rfeControl(functions = rfFuncs,
? ? ? ? ? ? ? ? ? method = "cv",
? ? ? ? ? ? ? ? ? number = 10,
? ? ? ? ? ? ? ? ? # method = "repeatedcv",
? ? ? ? ? ? ? ? ? # repeats = 5,
? ? ? ? ? ? ? ? ? verbose = FALSE)
rf.rfe <- rfe(x.train, y.train,
? ? ? ? ? ? ? sizes = subsets,
? ? ? ? ? ? ? rfeControl = ctrl)
rf.rfe
# worth the wait!
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Section 8: Unsupervised learning Use(s) of PCA...####
# Use(s) of PCA...
#
#主成分分析(Principal Component Analysis,PCA), 是一種統(tǒng)計(jì)方法。只做分類,根據(jù)幾個(gè)X決定Y是哪一類
#通過正交變換將一組可能存在相關(guān)性的變量轉(zhuǎn)換為一組線性不相關(guān)的變量,轉(zhuǎn)換后的這組變量叫主成分
#k均值聚類算法(k-means clustering algorithm)是一種迭代求解的聚類分析算法,
#其步驟是,預(yù)將數(shù)據(jù)分為K組,則隨機(jī)選取K個(gè)對(duì)象作為初始的聚類中心,然后計(jì)算每個(gè)對(duì)象與各個(gè)種子聚類中心之間的距離,
#把每個(gè)對(duì)象分配給距離它最近的聚類中心。聚類中心以及分配給它們的對(duì)象就代表一個(gè)聚類。每分配一個(gè)樣本,
#聚類的聚類中心會(huì)根據(jù)聚類中現(xiàn)有的對(duì)象被重新計(jì)算。這個(gè)過程將不斷重復(fù)直到滿足某個(gè)終止條件。
#終止條件可以是沒有(或最小數(shù)目)對(duì)象被重新分配給不同的聚類,沒有(或最小數(shù)目)聚類中心再發(fā)生變化,誤差平方和局部最小。
# Section 8: Unsupervised learning
# Use(s) of PCA...
#PCA的原理:don't make decisions based on y but on x's in particular
#PCA的目的:filter out some variables in the original domain.
#PCA的缺點(diǎn):your features become compulsive features of all input features
#PCA里為什么要scale:
#after scaling,the matrix formed by all x becomes correlation matrix instead of covariance Matrix
#redistribute the information as to maximise the variance
#rearranging data so as to extract most of the information early on.
#pca里的數(shù),及他所占的百分比是怎么來的
#for the matrix formed by all x, every xi has its own enginvalue lambda i,
#the its value is sqrt(lambda i) / sum(sqrt(lambda i))
#因此,我們應(yīng)該通過標(biāo)準(zhǔn)化來取消數(shù)據(jù)的大小對(duì)其對(duì)應(yīng)的特征值和PCA的影響
#install.packages('fdm2id')
# Section 8: Unsupervised learning
# Use(s) of PCA...
rm(list=ls())
library(caret)
library(pROC)
if(1){
? # library(mlbench) #also contains a version of the Ionosphere dataset
? # head(Ionosphere)
? library(fdm2id)
? head(ionosphere)#顯示數(shù)據(jù)的前幾行數(shù)(不知道有幾行)
? dat = ionosphere
? class(dat[,1])
? class(dat[,ncol(dat)])
} else {
? require(mlbench)
? data(Sonar)
? dat = Sonar
}
#要符合PCA的命名邏輯
names(dat) = c(paste("X",1:(ncol(dat)-1),sep=''), "Y")#重命名每列數(shù)據(jù),將其變?yōu)閄,Y
head(dat)
y = dat[,ncol(dat)]
x = dat[,-ncol(dat)]
n = nrow(x)
p = ncol(x)
table(y)/sum(table(y))#看數(shù)據(jù)里的每種y的占比
apply(x,2,sd)
sapply(x,sd)
sapply(x,mean)
M = cor(x)#correlation
diag(M) = 0
which(abs(M)>.9)
caret::findCorrelation(cor(x))
set.seed(6041)
# itrain = sample(1:n, round(.7*n))
# do stratified sampling instead:分層抽樣
itrain = createDataPartition(y, p=.7, times=1)[[1]] #do stratified sampling #做分層抽樣#結(jié)果和上面那個(gè)一樣
dtrain = dat[itrain,]
dtest = dat[-itrain,]
# (1) Using logistic regression for prediction...
logo = glm(Y~., dat=dtrain, family='binomial')#想看看數(shù)據(jù)的logistic regression,沒啥用
summary(logo)
logp = predict(logo, newdata=dtest, "response")
summary(logp)
logh = as.factor( levels(y)[(logp>.5)+1] )
# table(logh, dtest$Y)
caret::confusionMatrix(logh, dtest$Y)
roco = pROC::roc(predictor=logp, response=dtest$Y)
roco$auc
ci.auc(roco)# 95% CI of roco$auc
# boxplot(dat$X2~dat$Y)
# (2) Using PCA for prediction...
pca = prcomp(x, scale=TRUE)
plot(pca)
plot(x[,2:3], pch=20)#隨便看看兩個(gè)數(shù)據(jù)的關(guān)系
biplot(pca, col=c(1,2))#有原始數(shù)據(jù),有向量
biplot(pca, col=c(0,2))#只有向量,沒有原始數(shù)據(jù)
abline(h=0, v=0, col=8, lty=1)#畫坐標(biāo)軸
j = which(summary(pca)$importance[3,]>.85)[1]####只要PCA里前85%的數(shù)據(jù)####
names(pca)
xt = pca$x[,1:j]
# create reduced data frame
pca.dat = data.frame(xt,Y=y)
pca.dtrain = pca.dat[itrain,]
pca.dtest = pca.dat[-itrain,]
#把前85%的數(shù)據(jù)取出來,在做logistics regression
pca.logo = glm(Y~., dat=pca.dtrain, family='binomial')
summary(pca.logo)
pca.logp = predict(pca.logo, newdata=pca.dtest, "response")
pca.logh = as.factor( levels(y)[(pca.logp>.5)+1] )
caret::confusionMatrix(logh, dtest$Y)#PAC前的數(shù)據(jù)#Accuracy : 0.8173
caret::confusionMatrix(pca.logh, dtest$Y)#Accuracy : 0.7885 #沒有變低很多
roco = pROC::roc(predictor=logp, response=dtest$Y)#0.7856
pca.roco = pROC::roc(predictor=pca.logp, response=dtest$Y)#0.8302
roco$auc
pca.roco$auc
ci.auc(roco)
ci.auc(pca.roco)
################################說用caret包,沒說干啥,也不分析
set.seed(6041)
trC = trainControl(method="boot", number=10,
? ? ? ? ? ? ? ? ? # method="cv", number=10,repeat(重復(fù)幾遍)
? ? ? ? ? ? ? ? ? savePredictions = "all",
? ? ? ? ? ? ? ? ? classProbs = TRUE)#bootstrapping 也可以cv
modo = caret::train(x, y, family='binomial', method="glm", trControl=trC)
modo$results
pca.modo = caret::train(xt, y, family='binomial', method="glm", trControl=trC)
pca.modo$results
extract.caret.metrics <- function(co){
? # co: caret output
? ids = unique(co$pred$Resample)
? K = length(ids)
? aucs = accs = numeric(K)
? for(k in 1:K){
? ? sk = subset(co$pred,Resample==ids[k])
? ? l = co$levels[1]
? ? aucs[k] = roc(response=sk$obs, pred=sk[,l], quiet=TRUE)$auc
? ? tb = table(sk$obs, sk$pred)
? ? accs[k] = sum(diag(tb))/sum(tb)
? }
? return(list(aucs=aucs, accs=accs))
}
# compare CV AUCs and accuracies:
eo = extract.caret.metrics(modo)
c(mean(eo$aucs), mean(eo$accs))#PCA前AUC和accuracy
pca.eo = extract.caret.metrics(pca.modo)
c(mean(pca.eo$aucs), mean(pca.eo$accs))#PCA后AUC和accuracy
# reporting mean and SE instead:
c(mean(eo$aucs),sd(eo$aucs))#PCA前AUC和accuracy
c(mean(pca.eo$aucs),sd(pca.eo$aucs))#PCA后AUC和accuracy
################################說用caret包,沒說干啥,也不分析
# (2) Using PCA to cluster features...聚類
biplot(pca, col=c(0,2))
abline(h=0, v=0, col=8, lty=1)
# reproduce plot of projeted features:
pca.biplot <- function(pca,cols=2,a=1,b=2){
? plot(pca$rotation[,c(a,b)],pch='')
? abline(h=0,lty=3,col=1,lwd=.5)
? abline(v=0,lty=3,col=1,lwd=.5)
? nms = row.names(pca$rotation)
? text(pca$rotation[,c(a,b)],labels=nms,col=cols)
}
pca.biplot(pca,cols=2,a=1,b=2)#只要向量的點(diǎn)上有向量名字,不要向量線
#loading plot(載荷)
#里面的數(shù)表示它作為第i主成分時(shí),其對(duì)于平均水平的相對(duì)程度,正數(shù)表示平均水平以上,負(fù)數(shù)表示平均水平以下
abline(h=0, v=0, col=8, lty=1)
#如果兩個(gè)名字太近,don't need them all together,we can cluster on that.聚類
C = 5 #被選作集群的點(diǎn)數(shù) center,也是k
C = 10
x.proj = pca$rotation[,1:2]
#ko = kmeans(pca$rotation[,1:2], centers=5)
# run on first two components 要前兩個(gè)成分
#take five clusters 采取五個(gè)集群
ko = kmeans(x.proj, centers=C)
points(ko$centers, pch=20, cex=3)#看哪C個(gè)點(diǎn)被選作集群(選擇是隨機(jī)的)
#how they group up in the projected space, do clustering.
cbind(ko$cluster)#看每個(gè)點(diǎn)都被分為了哪個(gè)集群
table(ko$cluster)#每個(gè)集群都有幾個(gè)點(diǎn)
# pay attention what you cluster though! 所有PCA都要
ko.all = kmeans(pca$rotation, centers=C)
points(ko.all$centers[,1:2], pch=15, cex=3, col=4)
cbind(ko$cluster, ko.all$cluster) # watch out, cluster numbers are random!
#see the number of clusters OK for each cluster,picking identifying which features are contained in the cluster.
pca.clust.set = numeric(C)
#looking at which picture is the closest to the center
for(ic in 1:C){
? fs = which(ko$cluster==ic)
? centroid = ko$centers[ic,]
? dc = NULL
? for(ifs in fs){
? ? dc = c(dc, sqrt(mean((x.proj[ifs,]-centroid)^2)))
? }
? pca.clust.set[ic] = fs[which.min(dc)]#the closest feature from the centre for each cluster
}
x.ss = x[,pca.clust.set]
set.seed(6041)
pca.cl.modo = caret::train(x.ss, y, family='binomial', method="glm", trControl=trC)
pca.cl.modo$results
pca.cl.eo = extract.caret.metrics(pca.cl.modo)
c(mean(eo$aucs), mean(eo$accs))#auc 和 accuracy
c(mean(pca.eo$aucs), mean(pca.eo$accs))
c(mean(pca.cl.eo$aucs), mean(pca.cl.eo$accs))
# reporting mean and SE instead:
c(mean(eo$aucs),sd(eo$aucs))#AUC及其sd
c(mean(pca.eo$aucs),sd(pca.eo$aucs))
c(mean(pca.cl.eo$aucs),sd(pca.cl.eo$aucs))
#### Another example...in PCA####
library(ISLR)
dim(Hitters)
dat = na.omit(Hitters)
n = nrow(dat)
set.seed(4060)
dat = dat[sample(1:n, n, replace=FALSE),]
i.train = 1:round(.7*n)
dat.train = dat[i.train,]
dat.valid = dat[-i.train,]#test group
# (a) Scaled PCA:
#這里有很多x,這里只選取前6個(gè)
set.seed(1)
pca = prcomp(dat.train[,1:6], scale=TRUE)
summary(pca)
# 4 PCs are necessary to capture over 86% of the information
# (b) k-means:
set.seed(1)
ko = kmeans(dat.train[,1:6], 4)#4個(gè)集群
table(ko$cluster)
par(font=2, font.axis=2, font.lab=2)
dims = c(1,3) #用x1和x3作圖,這里隨便誰都行
plot(dat.train[,dims], pch=20,? col=c(2,4,3,5)[ko$cluster], cex=2)
#對(duì)上面的集群作圖
points(ko$centers[,dims], pch='*',? col=1, cex=4)
#這里沒解釋這四個(gè)點(diǎn)標(biāo)出來是干什么的
#根據(jù)圖感覺應(yīng)該是每個(gè)集群的中心點(diǎn)
#can see clearly that there's 4 groups
#
# From figure:
# Stars indicate the centroids of each cluster
# AtBat contributes more because of scale effect:
# 從圖中:
# 星號(hào)表示每個(gè)簇的質(zhì)心
# 由于規(guī)模效應(yīng),AtBat 貢獻(xiàn)更大:(AtBat的sd最大)
sapply(dat.train[,1:6],sd)
# (b) k-means:
set.seed(1)
#對(duì)PCA進(jìn)行k-means
pca.ko = kmeans(pca$x, 4)
table(pca.ko$cluster)
par(font=2, font.axis=2, font.lab=2)
dims = c(1,3)
plot(dat.train[,dims], pch=20,? col=c(2,4,3,5)[pca.ko$cluster], cex=2)
#和之前的圖差別挺大的,也沒分析為啥
###1:02講完了
#### Section 8: miz Imbalance...####
# Imbalance...
#imbalance 就是當(dāng)數(shù)據(jù)里絕大部分都是A,只有很少是B時(shí)
#比如No占0.9667,而 Yes 只有0.0333
#很多Yes被識(shí)別為No
#Sensitivity和Specificity中有一個(gè)數(shù)會(huì)非常小
#處理imbalance(對(duì)數(shù)據(jù)進(jìn)行加權(quán),讓yes的比例變大) 以后
#準(zhǔn)確率下降了,但Yes的識(shí)別率顯著提高
#Sensitivity和Specificity都很大
#
#Confusion Matrix and Statistics
#Reference
#Prediction? No? Yes
#No? ? ? ? ? TN? FN
#Yes? ? ? ? ? FP? TP
#True/FALSE
#Positive/Negative
#FP:failed alarm
#FN:false detection
#Sensitivity:TP/(TP+FN)
#Specificity=TN/(TN+FP)
rm(list=ls())
library(ISLR)
head(Default)
tb = table(Default$default)
tb/sum(tb)
dat = Default
set.seed(4061)
itrain = createDataPartition(dat$default, p=.7, times=1)[[1]]
dtrain = dat[itrain,]
dtest = dat[-itrain,]
trC = trainControl(method="boot", number=50,
? ? ? ? ? ? ? ? ? # method="cv", number=10,
? ? ? ? ? ? ? ? ? savePredictions = "all",
? ? ? ? ? ? ? ? ? classProbs = TRUE)
# (1) Using logistic regression for prediction...
set.seed(6041)
x = dtrain
x$default = NULL
y = dtrain$default
modo = caret::train(x, y, family='binomial', method="glm", trControl=trC)
modo$results
preds = predict(modo, dtest)
confusionMatrix(preds, dtest$default)#很多Yes被識(shí)別為No
#Sensitivity和Specificity中有一個(gè)數(shù)會(huì)非常小
confusionMatrix(preds, dtest$default, positive=levels(dtest$default)[2])
# positive=levels(dtest$default)[2]:? 從positive=yes變成positive=No
#Accuracy和matrix不變,Sensitivity和Specificity數(shù)值置換了
levels(dtest$default)
# (2) Using weighted logistic regression for prediction...
tbt = table(dtrain$default)
ws = rev( as.numeric(tbt/sum(tbt)) )
w = numeric(nrow(dtrain))
l = levels(dtrain$default)
w[which(dtrain$default==l[1])] = ws[1]
w[which(dtrain$default==l[2])] = ws[2]
#modo = caret::train(x, y, family='binomial', method="glm", trControl=trC)
modo.w = caret::train(x, y, family='binomial', method="glm", trControl=trC, weights=w)
modo.w$results
preds.w = predict(modo.w, dtest)
confusionMatrix(preds.w, dtest$default, positive=levels(dtest$default)[2])
#準(zhǔn)確率下降了,但Yes的識(shí)別率顯著提高
#Sensitivity和Specificity都很大