library(tm) #パッケージtmをインストールの上読み込む


rawtext <- read.csv("test_sentences.csv",stringsAsFactors=F,header=F)[,1]
rawtext


sentences <- VCorpus(VectorSource(rawtext))


sentences[[1]]$content


sent.clean <- tm_map(sentences, content_transformer(tolower))
sent.clean <- tm_map(sent.clean, removeNumbers)
sent.clean <- tm_map(sent.clean, removeWords, stopwords("en"))
sent.clean <- tm_map(sent.clean, removePunctuation)
sent.clean <- tm_map(sent.clean, stemDocument)
sent.clean <- tm_map(sent.clean, stripWhitespace)


sent.clean[[1]]$content; sentences[[1]]$content


bow <- DocumentTermMatrix(sent.clean,control=list(wordLengths=c(2,20)))


as.matrix(bow)


rbow <- bow[,findFreqTerms(bow,2)]


as.matrix(rbow)


result <- svd(bow)
str(result) #中身を確認

List of 3
 $ d: num [1:8] 3.02 2.28 2 1.73 1.57 ...
 $ u: num [1:8, 1:8] 3.95e-18 -8.58e-17 3.33e-17 -4.12e-17 -5.72e-16 ...
 $ v: num [1:23, 1:8] 1.72e-16 -2.75e-01 -4.29e-16 1.40e-16 2.80e-18 ...


result$d


U <- as.matrix(result$u)
D <- diag(result$d)
V <- as.matrix(result$v)


ap.bow <- U[,1:2] %*% D[1:2,1:2] %*% t(V[,1:2])  #k=2とした近似
sum(abs(bow-ap.bow))  ##近似の精度


ap.bow <- U[,1:5] %*% D[1:5,1:5] %*% t(V[,1:5])  #k=5とした近似
sum(abs(bow-ap.bow))  ##近似の精度


ap.bow <- U[,1:8] %*% D[1:8,1:8] %*% t(V[,1:8])  #k=rとした近似
sum(abs(bow-ap.bow))  ##近似の精度


rownames(V) <- colnames(bow)  #名前を付ける


round(V[,1],3)  ##harriとpotterの絶対値が大きい
round(V[,2],3)  ##lukeの絶対値が大きい


plot(D[1,1]*U[,1], D[2,2]*U[,2])


Ppower <- read.csv("Ppower.csv",header=T,row.names=1)
head(Ppower)


xx <- scale(Ppower,scale=T) #scaling


sigma <- t(xx)%*%xx/47
result <- eigen(sigma)
evalue <- result$values
evector <- result$vectors


rownames(evector) <- colnames(xx) #解釈をしやすくするために名前をつける．


round(evector[,1],3)
round(evector[,2],3)


z1 <- xx %*% evector[,1]
z2 <- xx %*% evector[,2]
plot(z1,z2,col="blue",pch=3)
text(z1,z2,row.names(xx),cex=0.9) #点にサンプル名を付与する．


Z <- xx %*% evector[,1:5] #第5主成分スコアまで例えば計算する．
round(apply(Z,2,mean),3) #各主成分スコアの平均値
round(t(Z)%*%Z/47,3) #主成分スコア間の共分散


round(evalue,3)[1:5]


biplot(Z[,1:2],evector[,1:2],ylim=c(-5,9),xlim=c(-25,7),cex=0.85)


round(cumsum(evalue)/sum(evalue),3)  #累積寄与率
plot(1:24,round(cumsum(evalue)/sum(evalue),3))  #そのプロット

	age	boy	dad	destroy	enough	father	get	harri	hey	high	⋯	luke	mani	may	old	pleas	potter	privaci	star	unusu	way
1	0	0	0	0	0	1	0	0	0	0	⋯	1	0	0	0	0	0	0	0	0	0
2	1	0	0	1	0	0	0	0	0	0	⋯	0	0	0	0	0	0	0	1	0	0
3	0	0	0	0	0	0	0	0	0	0	⋯	1	0	1	0	1	0	1	0	0	0
4	0	0	0	0	1	0	0	0	0	0	⋯	1	0	0	1	0	0	0	0	0	0
5	0	0	1	0	0	0	0	0	1	0	⋯	0	0	0	0	0	0	0	0	0	0
6	0	1	0	0	0	0	0	1	0	1	⋯	0	1	0	0	0	1	0	0	1	1
7	0	0	0	0	0	0	1	1	0	0	⋯	0	0	0	0	0	1	0	0	0	1
8	0	0	0	0	0	0	0	1	0	0	⋯	0	0	0	0	0	0	0	0	0	0

	population	households	office	income	nationaltax	localtax	agriculturalout	forestryprod	landing	factory	⋯	deposit	publicexpense	houses	cars	educationalamount	booksales	newspapersales	tv	phone	posting
	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<dbl>	<int>	<int>	⋯	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
Hokkaido	5650573	2522295	270504	145293	1379098	546638	10579	463.7	761331	10668	⋯	336487	1611254	49183	3720320	945495	177154	2235101	1652081	6154045	650837
Aomori	1479358	551806	74341	32498	242730	131442	2402	88.2	107727	3188	⋯	70174	324170	8971	999912	264053	40213	531167	470546	1384375	114626
Iwate	1405060	488354	72456	34152	224269	123476	2587	193.4	108752	4126	⋯	76438	339411	8906	983234	277808	35185	471524	443942	1300483	114618
Miyagi	2350026	856527	115297	61092	735121	245372	1870	76.1	373037	5782	⋯	125693	391923	19382	1557230	365365	70995	812314	719845	2525483	351549
Akita	1173722	410308	65300	27294	186373	97512	2208	123.2	5599	4130	⋯	56790	328282	6664	831589	214608	30773	442938	385189	1087529	93834
Yamagata	1225990	387732	70523	29851	202270	107894	2349	62.5	4116	5971	⋯	67083	363756	7088	920699	215490	29966	462638	371689	1127332	105540

18.531	0.000	0.000	0.00	0.000
0.000	2.512	0.000	0.00	0.000
0.000	0.000	1.164	0.00	0.000
0.000	0.000	0.000	0.63	0.000
0.000	0.000	0.000	0.00	0.266

R演習 : 主成分分析¶

潜在的意味分析¶

特異値分解(SVD)による行列の近似¶

主成分分析¶

	age	boy	dad	destroy	enough	father	get	harri	hey	high	⋯	luke	mani	may	old	pleas	potter	privaci	star	unusu	way
1	0	0	0	0	0	1	0	0	0	0	⋯	1	0	0	0	0	0	0	0	0	0
2	1	0	0	1	0	0	0	0	0	0	⋯	0	0	0	0	0	0	0	1	0	0
3	0	0	0	0	0	0	0	0	0	0	⋯	1	0	1	0	1	0	1	0	0	0
4	0	0	0	0	1	0	0	0	0	0	⋯	1	0	0	1	0	0	0	0	0	0
5	0	0	1	0	0	0	0	0	1	0	⋯	0	0	0	0	0	0	0	0	0	0
6	0	1	0	0	0	0	0	1	0	1	⋯	0	1	0	0	0	1	0	0	1	1
7	0	0	0	0	0	0	1	1	0	0	⋯	0	0	0	0	0	1	0	0	0	1
8	0	0	0	0	0	0	0	1	0	0	⋯	0	0	0	0	0	0	0	0	0	0

	age	boy	dad	destroy	enough	father	get	harri	hey	high	⋯	luke	mani	may	old	pleas	potter	privaci	star	unusu	way
1	0	0	0	0	0	1	0	0	0	0	⋯	1	0	0	0	0	0	0	0	0	0
2	1	0	0	1	0	0	0	0	0	0	⋯	0	0	0	0	0	0	0	1	0	0
3	0	0	0	0	0	0	0	0	0	0	⋯	1	0	1	0	1	0	1	0	0	0
4	0	0	0	0	1	0	0	0	0	0	⋯	1	0	0	1	0	0	0	0	0	0
5	0	0	1	0	0	0	0	0	1	0	⋯	0	0	0	0	0	0	0	0	0	0
6	0	1	0	0	0	0	0	1	0	1	⋯	0	1	0	0	0	1	0	0	1	1
7	0	0	0	0	0	0	1	1	0	0	⋯	0	0	0	0	0	1	0	0	0	1
8	0	0	0	0	0	0	0	1	0	0	⋯	0	0	0	0	0	0	0	0	0	0

	age	boy	dad	destroy	enough	father	get	harri	hey	high	⋯	luke	mani	may	old	pleas	potter	privaci	star	unusu	way
1	0	0	0	0	0	1	0	0	0	0	⋯	1	0	0	0	0	0	0	0	0	0
2	1	0	0	1	0	0	0	0	0	0	⋯	0	0	0	0	0	0	0	1	0	0
3	0	0	0	0	0	0	0	0	0	0	⋯	1	0	1	0	1	0	1	0	0	0
4	0	0	0	0	1	0	0	0	0	0	⋯	1	0	0	1	0	0	0	0	0	0
5	0	0	1	0	0	0	0	0	1	0	⋯	0	0	0	0	0	0	0	0	0	0
6	0	1	0	0	0	0	0	1	0	1	⋯	0	1	0	0	0	1	0	0	1	1
7	0	0	0	0	0	0	1	1	0	0	⋯	0	0	0	0	0	1	0	0	0	1
8	0	0	0	0	0	0	0	1	0	0	⋯	0	0	0	0	0	0	0	0	0	0