Note: part of the data used below is available from https://www.statlearning.com/ .
adv <- read.csv("Advertising.csv",header=T)
##adv <- read.csv("Advertising.csv",header=T,fileEncoding="Shift_JIS") ##may be needed on Windows
adv <- adv[,-1] ##remove the first (index) column
head(adv)
| | TV | radio | newspaper | sales |
|---|---|---|---|---|
| | <dbl> | <dbl> | <dbl> | <dbl> |
| 1 | 230.1 | 37.8 | 69.2 | 22.1 |
| 2 | 44.5 | 39.3 | 45.1 | 10.4 |
| 3 | 17.2 | 45.9 | 69.3 | 9.3 |
| 4 | 151.5 | 41.3 | 58.5 | 18.5 |
| 5 | 180.8 | 10.8 | 58.4 | 12.9 |
| 6 | 8.7 | 48.9 | 75.0 | 7.2 |
Exercise : Draw the scatter plots of TV $\times$ sales, radio $\times$ sales, and newspaper $\times$ sales.
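One possible way to draw the three plots with base graphics (a sketch; the 1-by-3 layout is just one choice):
par(mfrow=c(1,3))                                   ##three panels side by side (our layout choice)
plot(adv[,"TV"], adv[,"sales"], xlab="TV", ylab="sales")
plot(adv[,"radio"], adv[,"sales"], xlab="radio", ylab="sales")
plot(adv[,"newspaper"], adv[,"sales"], xlab="newspaper", ylab="sales")
par(mfrow=c(1,1))                                   ##reset the layout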
We carry out simple regression analysis with base R. With $Y$ the sales and $X$ the TV advertising budget, we fit the model $$ Y=\beta_{0} + \beta_{1}X + \varepsilon $$ to the data.
result <- lm(sales~TV,data=adv)
summary(result)
Call:
lm(formula = sales ~ TV, data = adv)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3860 -1.9545 -0.1913  2.0671  7.2124 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 7.032594   0.457843   15.36   <2e-16 ***
TV          0.047537   0.002691   17.67   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.259 on 198 degrees of freedom
Multiple R-squared:  0.6119,    Adjusted R-squared:  0.6099 
F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16
The regression coefficients can be extracted with the coefficients function or its abbreviation coef.
coef(result)
Exercise : Overlay the regression line on the TV $\times$ sales scatter plot.
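For example, the fitted line can be added with abline (one possible sketch, reusing the result object fitted above):
plot(adv[,"TV"], adv[,"sales"], xlab="TV", ylab="sales")
abline(result, col=2, lwd=1.5)      ##regression line from lm(sales~TV)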
Predicted values can be computed with predict. The boxplot below summarizes the residuals (observed sales minus fitted values).
boxplot(adv[,"sales"] - predict(result))
The R implementation of multiple regression is almost the same as in the simple regression case.
result <- lm(sales~TV+radio+newspaper,data=adv)
summary(result)
Call:
lm(formula = sales ~ TV + radio + newspaper, data = adv)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.8277 -0.8908  0.2418  1.1893  2.8292 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.938889   0.311908   9.422   <2e-16 ***
TV           0.045765   0.001395  32.809   <2e-16 ***
radio        0.188530   0.008611  21.893   <2e-16 ***
newspaper   -0.001037   0.005871  -0.177     0.86    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.686 on 196 degrees of freedom
Multiple R-squared:  0.8972,    Adjusted R-squared:  0.8956 
F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16
Exercise : Compute the Std. Error and t value entries above without using the lm function.
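One possible solution sketch, based on the standard least-squares formulas $\widehat{\beta}=(X^{\top}X)^{-1}X^{\top}y$, $\widehat{\sigma}^{2}=\|y-X\widehat{\beta}\|^{2}/(n-p)$, and $\mathrm{s.e.}(\widehat{\beta}_{j})=\widehat{\sigma}\sqrt{[(X^{\top}X)^{-1}]_{jj}}$ (the object names below are our own):
X <- cbind(1, as.matrix(adv[,c("TV","radio","newspaper")]))  ##design matrix with an intercept column
y <- adv[,"sales"]
XtX.inv <- solve(t(X)%*%X)
beta.hat <- XtX.inv %*% t(X) %*% y                           ##least-squares estimates
sigma2 <- sum((y - X%*%beta.hat)^2)/(nrow(X)-ncol(X))        ##residual variance estimate
std.err <- sqrt(sigma2*diag(XtX.inv))                        ##Std. Error column
t.value <- as.vector(beta.hat)/std.err                       ##t value column
cbind(beta.hat, std.err, t.value)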
Can the following synthetic data be fitted by a linear model?
vert.data <- read.csv("vert_data.csv")[,-1]
x <- vert.data[,1]; y <- vert.data[,2]
plot(x,y,xlim=c(0,2.1),ylim=c(-0.5,13.5))
Let us fit the polynomial regression model $$ Y=\beta_{0} + \beta_{1}X + \beta_{2}X^{2} + \cdots + \beta_{k}X^{k} + \varepsilon. $$
poly3 <- as.data.frame(cbind(y,x,x^2,x^3)) #k=3
poly5 <- as.data.frame(cbind(y,x,x^2,x^3,x^4,x^5)) #k=5
poly9 <- as.data.frame(cbind(y,x,x^2,x^3,x^4,x^5,x^6,x^7,x^8,x^9)) #k=9
lmpoly3 <- lm(y~.,data=poly3); lmpoly5 <- lm(y~.,data=poly5);lmpoly9 <- lm(y~.,data=poly9)
cf3 <- coef(lmpoly3); cf5 <- coef(lmpoly5); cf9 <- coef(lmpoly9)
xlim <- c(0,2.1); ylim <- c(-0.5,13.5)
plot(x,y,xlim=xlim,ylim=ylim)
par(new=T)
curve(cf3[1] + cf3[2]*x + cf3[3]*x^2 + cf3[4]*x^3,
xlim=xlim,ylim=ylim,xlab="",ylab="",col=2,lwd=1.5)
par(new=T)
curve(cf5[1] + cf5[2]*x + cf5[3]*x^2 + cf5[4]*x^3 + cf5[5]*x^4 + cf5[6]*x^5,
xlim=xlim,ylim=ylim,xlab="",ylab="",col=3,lwd=1.5)
par(new=T)
curve(cf9[1] + cf9[2]*x + cf9[3]*x^2 + cf9[4]*x^3 + cf9[5]*x^4 + cf9[6]*x^5 + cf9[7]*x^6 +cf9[8]*x^7 + cf9[9]*x^8 + cf9[10]*x^9,
xlim=xlim,ylim=ylim,xlab="",ylab="",col=4,lwd=1.5)
legend("topleft",legend=c("k=3","k=5","k=9"),lty=1,col=c(2,3,4),lwd=1.5,)
In fact, the true degree is $k=3$.
Load the Boston data.
library(MASS)
head(Boston)
| | crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| | <dbl> | <dbl> | <dbl> | <int> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
| 1 | 0.00632 | 18 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
| 2 | 0.02731 | 0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
| 3 | 0.02729 | 0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
| 4 | 0.03237 | 0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
| 5 | 0.06905 | 0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
| 6 | 0.02985 | 0 | 2.18 | 0 | 0.458 | 6.430 | 58.7 | 6.0622 | 3 | 222 | 18.7 | 394.12 | 5.21 | 28.7 |
Split the data into a training set and a test set.
nt <- 100 ##number of test samples
test.boston <- Boston[1:nt,] ##first nt rows as the test set
train.boston <- Boston[(nt+1):506,] ##remaining rows as the training set
We want to compare the following four models. $$ \begin{align*} {\tt medv} &= \beta_{0} + \beta_{1}{\tt crim} + \varepsilon \\ {\tt medv} &= \beta_{0} + \beta_{1}{\tt rm} + \beta_{2}{\tt age} + \varepsilon \\ {\tt medv} &= \beta_{0} + \beta_{1}{\tt tax} + \beta_{2}{\tt lstat} + \varepsilon \\ {\tt medv} &= \beta_{0} + \beta_{1}{\tt zn} + \beta_{2}{\tt indus} + \beta_{3}{\tt chas} + \varepsilon \end{align*} $$ The test MSE for the first model is computed as follows.
lm1 <- lm(medv~crim,data=train.boston)
cf1 <- coef(lm1)
yhat1 <- cf1[1] + cf1[2] * test.boston[,"crim"]
#predict(lm1,test.boston) gives the same result
sum((test.boston[,"medv"] - yhat1)^2)/nt
Exercise : Compute the MSE for the other three models in the same way and find the best model.
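One possible sketch that computes the test MSE for all four models at once (the helper name test.mse is our own):
test.mse <- function(formula){
  fit <- lm(formula, data=train.boston)
  yhat <- predict(fit, test.boston)
  sum((test.boston[,"medv"] - yhat)^2)/nt
}
models <- list(medv~crim, medv~rm+age, medv~tax+lstat, medv~zn+indus+chas)
sapply(models, test.mse)      ##the best model attains the smallest MSE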
Load the Auto (car) data.
Auto <- read.csv("Auto.csv",header=T)[,-1]
x <- log(Auto[,"horsepower"])
y <- Auto[,"mpg"]
plot(x,y)
Consider fitting a polynomial regression model to the data, and let us choose its degree $k$ by 8-fold cross-validation.
Exercise : Complete the following incomplete code so that it carries out the above (one possible completion is sketched after the code).
X <- as.data.frame(cbind(y,x,x^2,x^3,x^4,x^5))
n <- dim(X)[1]
##8-fold cross-validation since n=392
fold.n <- n/8
accuracy <- rep(0,5)
for(i in 1:5){
  error <- rep(0,8)
  data <- X[,1:(i+1)]
  for(D in 0:7){
    test.data <-
    train.data <-
    result <- lm(y~.,data=train.data)
    yhat <-
    error[D+1] <- sum((test.data[,1] - yhat)^2)/fold.n
  }
  accuracy[i] <- mean(error)
}
which.min(accuracy)
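For reference, one possible way to fill in the blanks (a sketch; assigning fold D the consecutive block of fold.n rows is just one reasonable choice of fold split):
X <- as.data.frame(cbind(y,x,x^2,x^3,x^4,x^5))
n <- dim(X)[1]
fold.n <- n/8                                   ##49 observations per fold since n=392
accuracy <- rep(0,5)
for(i in 1:5){
  error <- rep(0,8)
  data <- X[,1:(i+1)]                           ##response plus the first i polynomial terms
  for(D in 0:7){
    idx <- (D*fold.n+1):((D+1)*fold.n)          ##rows belonging to the D-th fold (our choice)
    test.data <- data[idx,]
    train.data <- data[-idx,]
    result <- lm(y~.,data=train.data)
    yhat <- predict(result, test.data)
    error[D+1] <- sum((test.data[,1] - yhat)^2)/fold.n
  }
  accuracy[i] <- mean(error)                    ##average test MSE for degree i
}
which.min(accuracy)                             ##degree with the smallest CV error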