协方差
'''
data.iris=iris[,-5]
data.iris
cov(data.iris)
cor(data.iris)
'''
offer 数据:
data=read.table("d:/offer.txt",header=TRUE,sep="")
offer=data
M=cor(offer)
M
f=function(x,k1,k2){
which(x<=k2 & x>=k1)
}
k1=0.8
k2=1
result=apply(M,2,f,k1,k2)
name=names(result) # Extract name from a list
result$SC
names(result$SC)
c("GSP",names(result$GSP))
offer 主成分分析
data=read.table("d:/offer.txt",header=TRUE,sep="")
offer=data
offer=as.matrix(offer)
meanValue=apply(offer,1,mean)
offer=offer-meanValue
M=cov(offer)
N=eigen(M)
value=N$values
vector=N$vectors
value[1:2]
vector[,1:2]
sum(value[1:4]/sum(value))
plot(value,type="l")
Y=offer%*%(vector[,1:4])
Y
工业数据:
data=read.table("d:/industry.txt",header=TRUE,sep="")
industry=data
M=as.data.frame(industry)
industry_object=princomp(M,cor=T,scores=T)
summary(industry_object,loadings=T)
loadings(industry_object)
scores=industry_object$scores
tatal=apply(scores,1,sum)
N=cbind(scores,tatal)
T=round(N,digits=3)
T
对称矩阵谱分解
data=read.table("d:/industry.txt",header=TRUE,sep="")
industry=data
M=as.data.frame(industry)
N=eigen(cor(M))
values=N$values
vectors=N$vectors
accurate=sum(values[1:3])/sum(values)
M=values[1]*vectors[,1]%*%t(vectors[,1])+
values[2]*vectors[,2]%*%t(vectors[,2])+
values[3]*vectors[,3]%*%t(vectors[,3])
M
因子分析主成分法
data=read.table("C:/offer.txt",header=TRUE,sep="")
M=as.data.frame(data)
C=cor(M)
N=eigen(cor(M))
values=N$values
vectors=N$vectors
accurate=sum(values[1:4])/sum(values)
A=sqrt(values[1:4])*vectors[,1:4]
D=diag(diag(C-A%*%t(A)))
A%*%t(A)-D
因子分析主因子法
data=read.table("d:/offer.txt",header=TRUE,sep="")
R=cor(data)
invR=solve(R)
D=diag(1/diag(invR))
k=0
repeat{
R1=R-D
D1=D
eig=eigen(R1)
val=eig$values
vec=eig$vectors
A=(vec[,1:3])%*%(diag(sqrt(val[1:3])))
D=diag(1-diag(A%*%t(A)))
k=k+1
if (sqrt(sum(D-D1)^2)<1e-5|k>20){break}
}
A
D
因子旋转
data=read.table("d:/offer.txt",header=TRUE,sep="")
R=cor(data)
invR=solve(R)
D=diag(1/diag(invR))
k=0
repeat{
R1=R-D
D1=D
eig=eigen(R1)
val=eig$values
vec=eig$vectors
A=(vec[,1:3])%*%(diag(sqrt(val[1:3])))
D=1-diag(A%*%t(A))
D=diag(D)
k=k+1
if(sqrt(sum(D-D1)^2)<1e-5|k>20){break}
}
print(A)
A.v=varimax(A,eps=1e-5)
print(A.v)
马氏距离判别法
#马氏距离
data=as.matrix(iris[,-5])
sample=data[5,]
s_1=data[1:50,]
s_2=data[51:100,]
miu_1=apply(s_1,2,mean)
miu_2=apply(s_2,2,mean)
cor_1=cov(s_1)
cor_2=cov(s_2)
m_1=(sample-miu_1)%*%solve(cor_1)%*%(sample-miu_1)
m_2=(sample-miu_2)%*%solve(cor_2)%*%(sample-miu_2)
m_1
m_2
#欧氏距离
(sample-miu_1)%*%(sample-miu_1)
(sample-miu_2)%*%(sample-miu_2)
方法二:
n1=nrow(data[1:50,])
n2=nrow(data[51:100,])
n3=nrow(data[101:150,])
s1=(n1-1)*cov(data[1:50,])
s2=(n2-1)*cov(data[51:100,])
s3=(n3-1)*cov(data[101:150,])
sigma=(s1+s2+s3)/(n1+n2+n3-3)
miu1=apply(data[1:50,],2,mean)
miu2=apply(data[51:100,],2,mean)
miu3=apply(data[101:150,],2,mean)
d1=mahalanobis(data,miu1,sigma)
d2=mahalanobis(data,miu2,sigma)
d3=mahalanobis(data,miu3,sigma)
M=matrix(0,3,150)
M[1,]=d1
M[2,]=d2
M[3,]=d3
class=apply(M,2,which.min)
x1=class[1:50]
x2=class[51:100]
x3=class[101:150]
length(x1[x1!=1])+length(x2[x2!=2])+length(x3[x3!=3])
Fisher 判别法
data=as.matrix(iris[,-5])
data=data[1:100,]
miu_1=apply(data[1:50,],2,mean)
miu_2=apply(data[51:100,],2,mean)
sigma=cov(data) #假设总体有相同的协方差阵
# 如果未知总体的协方差阵, 则计算总体协方差的无偏估计
s_1=49*cov(data[1:50,])
s_2=49*cov(data[51:100,])
sigma=(s_1+s_2)/(50+50-2)
# if class>=0, then the sample belong to class 1, else class 2.
f=function(x){class=(miu_1-miu_2)%*%solve(sigma)%*%(2*x-miu_1-miu_2)}
apply(data,1,f)
方法二:
library(MASS)
attach(iris)
ld=lda(Species~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width)
pre=predict(ld)
newclass=pre$class
cbind(Species,newclass,pre$x)
table(newclass,Species)
Bayes 判别
library(MASS)
attach(iris)
ld=lda(Species~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width)
prior=c(1/3,1/3,1/3)
pre=predict(ld,newdata)
newclass=pre$class
cbind(Species,newclass,pre$x)
table(newclass,Species)
例 1
a=c(11.2,14.9,14.3,13.5,16.2,14.3,20.0,21.8,19.0,16.0,11.9,8.7,14.3,10.1,9.1,13.8,15.3,11.0,18.0,10.4,8.2,11.4,11.6,8.4,8.2,10.9,15.6)
b=c(57.25,67.19,67.74,55.63,75.51,57.63,83.94,68.03,78.31,57.11,49.97,30.72,37.65,34.63,56.33,65.23,55.62,55.55,62.85,30.01,29.28,62.88,28.57,30.23,15.96,24.75,21.44)
c=c(13.47,7.89,19.41,20.59,11.06,22.51,15.09,39.42,83.13,12.57,30.70,15.41,12.95,7.68,10.30,4.69,6.06,8.02,6.40,4.61,6.11,5.31,5.08,6.03,8.04,8.34,28.62)
d=c(73.41,73.09,72.33,73.33,72.08,77.35,89.50,71.90,89.45,60.51,69.20,60.29,66.42,62.96,66.01,64.24,54.74,67.47,58.83,60.26,50.71,61.49,68.47,55.55,40.26,46.01,46.01)
a1=c(16.5,20.6,8.6)
b1=c(80.05,81.24,42.06)
c1=c(8.81,5.37,8.88)
d1=c(73.04,60.43,56.37)
data=data.frame(a,b,c,d)
sample=data.frame(a1,b1,c1,d1)
sample=as.matrix(sample)
s_1=data[1:10,]
s_2=data[11:27,]
miu_1=apply(s_1,2,mean)
miu_2=apply(s_2,2,mean)
cor_1=cov(s_1)
cor_2=cov(s_2)
sample=sample[3,]
(sample-miu_1)%*%solve(cor_1)%*%(sample-miu_1)
(sample-miu_2)%*%solve(cor_2)%*%(sample-miu_2)
例 2
library(MASS)
a=c(11.2,14.9,14.3,13.5,16.2,14.3,20.0,21.8,19.0,16.0,11.9,8.7,14.3,10.1,9.1,13.8,15.3,11.0,18.0,10.4,8.2,11.4,11.6,8.4,8.2,10.9,15.6,16.5,20.6,8.6)
b=c(57.25,67.19,67.74,55.63,75.51,57.63,83.94,68.03,78.31,57.11,49.97,30.72,37.65,34.63,56.33,65.23,55.62,55.55,62.85,30.01,29.28,62.88,28.57,30.23,15.96,24.75,21.44,80.05,81.24,42.06)
c=c(13.47,7.89,19.41,20.59,11.06,22.51,15.09,39.42,83.13,12.57,30.70,15.41,12.95,7.68,10.30,4.69,6.06,8.02,6.40,4.61,6.11,5.31,5.08,6.03,8.04,8.34,28.62,8.81,5.37,8.88)
d=c(73.41,73.09,72.33,73.33,72.08,77.35,89.50,71.90,89.45,60.51,69.20,60.29,66.42,62.96,66.01,64.24,54.74,67.47,58.83,60.26,50.71,61.49,68.47,55.55,40.26,46.01,46.01,73.04,60.43,56.37)
e=c(rep(c(0),10),rep(c(1),20))
data=data.frame(a,b,c,d,e)
attach(data)
ld=lda(e~a+b+c+d)
pre=predict(ld)
newclass=pre$class
cbind(e,newclass,pre$x)
table(newclass,e)
例 3
library(MASS)
a=c(11.2,14.9,14.3,13.5,16.2,14.3,20.0,21.8,19.0,16.0,11.9,8.7,14.3,10.1,9.1,13.8,15.3,11.0,18.0,10.4,8.2,11.4,11.6,8.4,8.2,10.9,15.6)
b=c(57.25,67.19,67.74,55.63,75.51,57.63,83.94,68.03,78.31,57.11,49.97,30.72,37.65,34.63,56.33,65.23,55.62,55.55,62.85,30.01,29.28,62.88,28.57,30.23,15.96,24.75,21.44)
c=c(13.47,7.89,19.41,20.59,11.06,22.51,15.09,39.42,83.13,12.57,30.70,15.41,12.95,7.68,10.30,4.69,6.06,8.02,6.40,4.61,6.11,5.31,5.08,6.03,8.04,8.34,28.62)
d=c(73.41,73.09,72.33,73.33,72.08,77.35,89.50,71.90,89.45,60.51,69.20,60.29,66.42,62.96,66.01,64.24,54.74,67.47,58.83,60.26,50.71,61.49,68.47,55.55,40.26,46.01,46.01)
e=c(rep(c(0),10),rep(c(1),17))
a1=c(16.5,20.6,8.6)
b1=c(80.05,81.24,42.06)
c1=c(8.81,5.37,8.88)
d1=c(73.04,60.43,56.37)
e1=c(0,0,0)
data=data.frame(a,b,c,d,e)
newdata=data.frame(a1,b1,c1,d1,e1)
attach(data)
ld=lda(e~a+b+c+d)
prior=c(10/27,17/27)
pre=predict(ld,newdata)
newclass=pre$class
cbind(e,newclass,pre$x)
table(newclass,e)
k-means 聚类
1.
trace=function(M){
miu=apply(M,2,mean)
r=nrow(M)
p=ncol(M)
mur=rep(miu,r)
Mu=matrix(mur,r,p,byrow=TRUE)
tr=sum((M-Mu)^2)
list(mean=miu,trace=tr)
}
y1=c(0,1,2,2,3,4)
y2=c(0,0,0,1,1,1)
df=data.frame(y1,y2)
trace(as.matrix(df[1:5,]))
trace(as.matrix(df[6,]))
2.
data=as.matrix(iris[,-5])
x0=data[3:5,]
dis=NULL
for(i in seq(1,nrow(x0))){
x=rep(x0[i,],150)
M=matrix(x,150,4,byrow=T)
N=apply((data-M)^2,1,sum)
dis=cbind(dis,N)
}
apply(dis,1,which.max)
3.
nnum=150
d.ir=dist(iris[1:num,-5])
d.m=as.matrix(d.ir)
r=0.55
radi=matrix(rep(r,num*num),num,num)
nei=d.m-radi+diag(rep(r,num))
f=function(x){
y=x[x<0]
length(y)
}
n.nei=apply(nei,2,f)
x=nei[,1] # Pick a sample point.
A=x[x<=0]
name.A=names(A)
y=as.numeric(name.A)
y1=as.numeric(names(which(n.nei[y]>=3)))
M=nei[,y1]
g=function(x){which(x<=0)}
lis=apply(M,2,g)
u=NULL
for(i in 1:(length(lis)-1)){
u=union(u,lis[[i]])
}
#u=sort(u)
z=setdiff(u,y)
all(n.nei[z]>=3)
w=vector(len=0)
x=nei[,136]
A=x[x<=0]
name.A=names(A)
g=function(x){
which(x<0)
}
y=as.numeric(name.A)
repeat{
y1=as.numeric(names(which(n.nei[y]>=5)))
if(length(y1)==0) break else{
M=nei[,y1]
lis=apply(M,2,g)
u=lis[[1]]
for(i in 1:(length(lis)-1)){
u=union(u,lis[[i+1]])
}
if(length(w)==length(u)) break else{
w=union(w,u)
y=w
}
}
}
DB-Scan 聚类
num=150
data=iris[1:num,-5]
dis=dist(data)
dis=as.matrix(dis)
cutoff=0.55
cutoffM=matrix(rep(cutoff,num*num),num,num)
dis2=dis-cutoffM+diag(rep(cutoff,num))
f=function(x){
return(length(x[x<0]))
}
rho=apply(dis2,2,f)
delta=NULL
for(i in as.numeric(names(rho))){
denseMoreThanI=rho[rho>rho[i]]
if(length(denseMoreThanI)!=0){
name=as.numeric(names(denseMoreThanI))
deltaI=min(dis[i,name])
delta=c(delta,deltaI)
}else{
delta=c(delta,max(dis[i,]))
}
}
plot(rho,delta)