
###Phylogenetic Principle Component Analysis

#FILE: Macaulay_BirdCoM_pPCA.R

#CITATION: Macaulay, S. et al. XXXX. Decoupling body shape and mass distribution in birds and their dinosaurian ancestors

#CODE CREDITS: Modified from Bishop et al., 2020 by Karl Bates & Tatjana Hoehfurtner

#DESCRIPTION: Code to conduct the pPCA on the normalised heterogenous segment masses and normalised segment lengths. Note
#             that further analysis was done the PCA scores (see main text), though this is not performed here.

###Load Dependent Packages
library(ape)
library(xlsx)
library(geiger)
library(phytools)
library(ggplot2)

###Load Tree & Exclude Basal Taxa
tree=read.tree("Macaulay_BirdCoM_PCM_Tree.tre")
tax.list.to.go <- c("Iguana_iguana", "Melanosuchus_niger", "Crocodylus_johnstoni")
tree <- drop.tip(tree, tree$tip.label[match(tax.list.to.go, tree$tip.label)])

###Function which is used to ensure PC axes are always the same way round
foo<-function(x) sign(x[which(abs(x)==max(abs(x)))])

###Segment Mass Analysis

#Variables included in analysis
#"DVnorm"		"CCnorm"		"Head_Mass"		"Neck_Mass"
#"Torso_Mass"	"Hindlimb_Mass"	"Forelimb_Mass"	"Tail_Mass"

###Load and Prepare Data
Data <- read.xlsx("Macaulay_BirdCoM_PCM_Dataset.xlsx", sheetIndex = 2, row.names = 1)
Data[Data == "NA"] <- 0.001
Data <- Data[!(row.names(Data) %in% tax.list.to.go), ]
Data<-Data[,c(3:7,15:17)] #Select only relevant variables
Data[[8]] <- as.numeric(Data[[8]]) #read.xlsx does not allow "NA" specification and imports column as "character" class, this line converts back to "numeric"
name.check(tree,Data)

###Run phylogenetic PCA
pca.res.norm=phyl.pca(tree,Data,method="BM",mode="corr")
Eigenvalues.norm=pca.res.norm$Eval
Scores.norm=matrix(rep(1,length(pca.res.norm$S[,1])),length(pca.res.norm$S[,1]),1)%*%apply(pca.res.norm$S,2,foo)*pca.res.norm$S
Loadings.norm=pca.res.norm$L
Loadings.norm[,which(colSums(Scores.norm == pca.res.norm$S)==0)]= -1 * pca.res.norm$L[,which(colSums(Scores.norm == pca.res.norm$S)==0)]
Eigenvectors.norm=pca.res.norm$Evec
Eigenvectors.norm[,which(colSums(Scores.norm == pca.res.norm$S)==0)]= -1 * pca.res.norm$Evec[,which(colSums(Scores.norm == pca.res.norm$S)==0)]

###Plot first two principal components against each other
biplot(pca.res.norm,choices=c(1,2), cex=0.75)

###Calculate variance explained by each principal component
#list of matrices to contain percent variance
PercentVariance.norm=matrix(NA,nrow=ncol(Data), ncol=1)
Total.norm=sum(diag(Eigenvalues.norm))
for (i in 1:ncol(Data)){
  PercentVariance.norm[i]=(Eigenvalues.norm[i,i]/Total.norm)*100
}

###generate graph of first two PC axes coloured by Locomotor Mode 
scores2.norm<-data.frame(Scores.norm)
#scores2.norm$LocomotorMode<-hetero.norm$LocomotorMode
norm<-ggplot(scores2.norm,aes(PC1,PC2))+
  geom_point(aes())+#colour=LocomotorMode)
  theme_classic()+
  ggtitle("Normalised - with DV and CC")
norm+geom_text(label=rownames(scores2.norm), hjust = "inward", size=3)
ggsave("C:/Users/SamCross/Desktop/DinoCoM/ForSam/Master/SamRerun/Neat/hetero_norm_with_DV&CC_labelled.png",width = 10, height = 10, units = "in")

###Write PC scores to a spreadsheet
write.xlsx(Scores.norm, "Hetero_PC_Scores.xlsx", sheetName = "Norm_w_DC&CC", col.names = T, row.names = T)
write.xlsx(Loadings.norm, "Hetero_PC_Scores.xlsx", sheetName = "Norm_w_DC&CC_Load", col.names = T, row.names = T, append = T)
write.xlsx(PercentVariance.norm, "Hetero_PC_Scores.xlsx", sheetName = "Norm_w_DC&CC_%Var", col.names = T, row.names = T, append = T)

###Linear Measurement Analysis

#Variables included in analysis
#"DVnorm_het" "CCnorm_het" "PelvicWidth" "PelvicLength"
#"ShoulderWidth" "NeckLength" "SkullLength" "HindlimbLength"
#"ForelimbLength" "GALength" "TailLength"

###Load and Prepare Data
Data <- read.xlsx("Data_Master.xlsx", sheetIndex = 3, row.names = 1)
Data[Data == "NA"] <- 0.001
Data <- Data[!(row.names(Data) %in% tax.list.to.go), ]
Data<-Data[,c(3:4,7:15)] #Select only relevant variables
name.check(tree,Data)

###Run phylogenetic PCA
pca.res_norm1=phyl.pca(tree,Data,method="BM",mode="corr")
Eigenvalues_norm1=pca.res_norm1$Eval
Scores_norm1=matrix(rep(1,length(pca.res_norm1$S[,1])),length(pca.res_norm1$S[,1]),1)%*%apply(pca.res_norm1$S,2,foo)*pca.res_norm1$S
Loadings_norm1=pca.res_norm1$L
Loadings_norm1[,which(colSums(Scores_norm1 == pca.res_norm1$S)==0)]= -1 * pca.res_norm1$L[,which(colSums(Scores_norm1 == pca.res_norm1$S)==0)]
Eigenvectors_norm1=pca.res_norm1$Evec
Eigenvectors_norm1[,which(colSums(Scores_norm1 == pca.res_norm1$S)==0)]= -1 * pca.res_norm1$Evec[,which(colSums(Scores_norm1 == pca.res_norm1$S)==0)]

#Plot first two principal components against each other
biplot(pca.res_norm1,choices=c(1,2), cex=0.75)

##Calculate variance explained by each principal component
#list of matrices to contain percent variance
PercentVariance_norm1=matrix(NA,nrow=ncol(Data), ncol=1)
Total=sum(diag(Eigenvalues_norm1))
for (i in 1:ncol(Data)){
  PercentVariance_norm1[i]=(Eigenvalues_norm1[i,i]/Total)*100
}

#generate graph of first two PC axes coloured by Locomotor Mode
scores_norm1a<-data.frame(Scores_norm1)
#scores_norm1a$LocomotorMode<-linear_norm$LocomotorMode
lin_norm1<-ggplot(scores_norm1a,aes(PC1,PC2))+
  geom_point(aes())+#colour=LocomotorMode
  theme_classic()+
  ggtitle("Normalised - with (hetero) DV and CC")
lin_norm1+geom_text(label=rownames(scores_norm1a), hjust = "inward", size=3)
ggsave("C:/Users/SamCross/Desktop/DinoCoM/ForSam/Master/SamRerun/Neat/linear_norm_with_hetero_DV&CC_labelled.png",width = 10, height = 10, units = "in")

#Write PC scores to a spreadsheet
write.xlsx(Scores_norm1, "Linear_PC_Scores.xlsx", sheetName = "Norm_w_hetero_DC&CC", col.names = T, row.names = T)
write.xlsx(Loadings_norm1, "Linear_PC_Scores.xlsx", sheetName = "Norm_w_hetero_DC&CC_load", col.names = T, row.names = T, append = T)
write.xlsx(PercentVariance_norm1, "Linear_PC_Scores.xlsx", sheetName = "Norm_w_hetero_DC&CC_%var", col.names = T, row.names = T, append = T)