Model Selection

library(ISLR)
## Warning: package 'ISLR' was built under R version 3.1.2
summary(Hitters)
##      AtBat            Hits         HmRun            Runs       
##  Min.   : 16.0   Min.   :  1   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:255.2   1st Qu.: 64   1st Qu.: 4.00   1st Qu.: 30.25  
##  Median :379.5   Median : 96   Median : 8.00   Median : 48.00  
##  Mean   :380.9   Mean   :101   Mean   :10.77   Mean   : 50.91  
##  3rd Qu.:512.0   3rd Qu.:137   3rd Qu.:16.00   3rd Qu.: 69.00  
##  Max.   :687.0   Max.   :238   Max.   :40.00   Max.   :130.00  
##                                                                
##       RBI             Walks            Years            CAtBat       
##  Min.   :  0.00   Min.   :  0.00   Min.   : 1.000   Min.   :   19.0  
##  1st Qu.: 28.00   1st Qu.: 22.00   1st Qu.: 4.000   1st Qu.:  816.8  
##  Median : 44.00   Median : 35.00   Median : 6.000   Median : 1928.0  
##  Mean   : 48.03   Mean   : 38.74   Mean   : 7.444   Mean   : 2648.7  
##  3rd Qu.: 64.75   3rd Qu.: 53.00   3rd Qu.:11.000   3rd Qu.: 3924.2  
##  Max.   :121.00   Max.   :105.00   Max.   :24.000   Max.   :14053.0  
##                                                                      
##      CHits            CHmRun           CRuns             CRBI        
##  Min.   :   4.0   Min.   :  0.00   Min.   :   1.0   Min.   :   0.00  
##  1st Qu.: 209.0   1st Qu.: 14.00   1st Qu.: 100.2   1st Qu.:  88.75  
##  Median : 508.0   Median : 37.50   Median : 247.0   Median : 220.50  
##  Mean   : 717.6   Mean   : 69.49   Mean   : 358.8   Mean   : 330.12  
##  3rd Qu.:1059.2   3rd Qu.: 90.00   3rd Qu.: 526.2   3rd Qu.: 426.25  
##  Max.   :4256.0   Max.   :548.00   Max.   :2165.0   Max.   :1659.00  
##                                                                      
##      CWalks        League  Division    PutOuts          Assists     
##  Min.   :   0.00   A:175   E:157    Min.   :   0.0   Min.   :  0.0  
##  1st Qu.:  67.25   N:147   W:165    1st Qu.: 109.2   1st Qu.:  7.0  
##  Median : 170.50                    Median : 212.0   Median : 39.5  
##  Mean   : 260.24                    Mean   : 288.9   Mean   :106.9  
##  3rd Qu.: 339.25                    3rd Qu.: 325.0   3rd Qu.:166.0  
##  Max.   :1566.00                    Max.   :1378.0   Max.   :492.0  
##                                                                     
##      Errors          Salary       NewLeague
##  Min.   : 0.00   Min.   :  67.5   A:176    
##  1st Qu.: 3.00   1st Qu.: 190.0   N:146    
##  Median : 6.00   Median : 425.0            
##  Mean   : 8.04   Mean   : 535.9            
##  3rd Qu.:11.00   3rd Qu.: 750.0            
##  Max.   :32.00   Max.   :2460.0            
##                  NA's   :59
There are some missing values here,so before we proceed we will remove them:
Hitters<- span=""> na.omit(Hitters)
with(Hitters,sum(is.na(Salary)))
## [1] 0

Best Subset regression

We will now use the package leaps to evaluate all the best-subset models.
library(leaps)
regfit.full<- span=""> regsubsets(Salary~.,data=Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters)
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 4  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 5  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 7  ( 1 ) " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "  
## 8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"  
##          CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 ) "*"  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 ) "*"  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 ) "*"  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 ) " "  " "    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 ) " "  "*"    " "     "*"       "*"     " "     " "    " "
It gives by default best-subsets up to size 8; lets increase that to 19, i.e. all the variables
regfit.full<- span=""> regsubsets(Salary~.,data=Hitters,nvmax=19)
reg.summary<- span=""> summary(regfit.full)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
plot(reg.summary$cp,xlab='Number of Variables',ylab='Cp')
which.min(reg.summary$cp)
## [1] 10
points(10,reg.summary$cp[10],pch=20,col='red')
 There is a plot method for the regsubsetsobject
plot(regfit.full,scale='Cp')
coef(regfit.full,10)
##  (Intercept)        AtBat         Hits        Walks       CAtBat 
##  162.5354420   -2.1686501    6.9180175    5.7732246   -0.1300798 
##        CRuns         CRBI       CWalks    DivisionW      PutOuts 
##    1.4082490    0.7743122   -0.8308264 -112.3800575    0.2973726 
##      Assists 
##    0.2831680

Resampling Method

require(ISLR)
## Loading required package: ISLR
## Warning: package 'ISLR' was built under R version 3.1.2
#install.packages('boot')
require(boot)
## Loading required package: boot
## Warning: package 'boot' was built under R version 3.1.2
?cv.glm
## starting httpd help server ... done
plot(mpg~horsepower,data=Auto)
##LOOGV
glm.fit<- span=""> glm(mpg~horsepower,data=Auto)
cv.glm(Auto,glm.fit)$delta #pretty slow (doesnt use formula (5.2) on page 180)
## [1] 24.23151 24.23114
##Lets write a simple function to use formula (5.2)
loocv<- span=""> function(fit){
h<- span=""> lm.influence(fit)$h
mean((residuals(fit)/(1-h))^2)
}
## Now we try it out
loocv(glm.fit)
## [1] 24.23151
cv.error<- span=""> rep(0,5)
degree<- span=""> 1:5
for(d in degree){
glm.fit<- span=""> glm(mpg~poly(horsepower,d),data=Auto)
cv.error[d]<- span=""> loocv(glm.fit)
}
plot(degree,cv.error,type='b')
## 10-fold CV
cv.error10<- span=""> rep(0,5)
for(d in degree){
glm.fit<- span=""> glm(mpg~poly(horsepower,d),data=Auto)
cv.error10[d]<- span=""> cv.glm(Auto,glm.fit,K=10)$delta[1]
}
lines(degree,cv.error10,type='b',col='red')
## Bootstrap
## Minimum risk investment - Section 5.2
alpha<- span=""> function(x,y){
vx<- span=""> var(x)
vy<- span=""> var(y)
cxy<- span=""> cov(x,y)
(vy-cxy)/(vx+vy-2+cxy)
}
alpha(Portfolio$X,Portfolio$Y)
## [1] 0.6413231
## What is the standard error of alpha?
alpha.fn<- span=""> function(data,index){
with(data[index,],alpha(X,Y))
}
alpha.fn(Portfolio,1:100)
## [1] 0.6413231
set.seed(1)
alpha.fn(Portfolio,sample(1:100,100,replace=T))
## [1] 0.6727862
boot.out<- span=""> boot(Portfolio,alpha.fn,R=1000)
boot.out
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Portfolio, statistic = alpha.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##      original     bias    std. error
## t1* 0.6413231 0.08995096   0.4226578
plot(boot.out)

Nikkei225

28000-28550 up in the early session, down lately.