查看原文
其他

R语言微生物群落数据分析:从原始数据到结果(No2)

文涛聊科研 微生信生物 2022-05-08
library("phyloseq")
library("ggplot2")
library("gridExtra")
ps0 = readRDS("ps.rds")
ps0

ASV表格过滤

##ASV表格过滤
# Define prevalence of each taxa
# (in how many samples did each taxa appear at least once)
prev0 = apply(X = otu_table(ps0),
MARGIN = ifelse(taxa_are_rows(ps0), yes = 1, no = 2),
FUN = function(x){sum(x > 0)})
prevdf = data.frame(Prevalence = prev0,
TotalAbundance = taxa_sums(ps0),
tax_table(ps0))
keepPhyla = table(prevdf$Phylum)[(table(prevdf$Phylum) > 5)]
prevdf1 = subset(prevdf, Phylum %in% names(keepPhyla))
# Define prevalence threshold as 5% of total samples
prevalenceThreshold = 0.05 * nsamples(ps0)
prevalenceThreshold
## [1] 18
# Execute prevalence filter, using `prune_taxa()` function
ps1 = prune_taxa((prev0 > prevalenceThreshold), ps0)
ps1
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 353 taxa and 360 samples ]
## sample_data() Sample Data: [ 360 samples by 14 sample variables ]
## tax_table() Taxonomy Table: [ 353 taxa by 6 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 353 tips and 351 internal nodes ]
# Filter entries with unidentified Phylum.
ps2 = subset_taxa(ps1, Phylum %in% names(keepPhyla))
ps2

ggplot(prevdf1, aes(TotalAbundance, Prevalence, color = phylum)) +
geom_hline(yintercept = prevalenceThreshold, alpha = 0.5, linetype = 2) +
geom_point(size = 2, alpha = 0.7) +
scale_y_log10() + scale_x_log10() +
xlab("Total Abundance") +
facet_wrap(~phylum)

## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 341 taxa and 360 samples ]
## sample_data() Sample Data: [ 360 samples by 14 sample variables ]
## tax_table() Taxonomy Table: [ 341 taxa by 6 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 341 tips and 339 internal nodes ]

提取属等级

taxGlomRank = "genus"
length(get_taxa_unique(ps2, taxonomic.rank = taxGlomRank))
#表明一共有47个属
## [1] 47

ps3 = tax_glom(ps2, taxrank = taxGlomRank)
# Phylogenetic agglomeration
# How many genera are present after filtering?
h1 = 0.4
ps4 = tip_glom(ps2, h = h1)

可视化上面操作的进化树

multiPlotTitleTextSize = 8
p2tree = plot_tree(ps2,
method = "treeonly",
ladderize = "left",
title = "Before Agglomeration") +
theme(plot.title = element_text(size = multiPlotTitleTextSize))
p3tree = plot_tree(ps3,
method = "treeonly",
ladderize = "left",
title = paste0("tax_glom(..., taxrank='", taxGlomRank, "')")) +
theme(plot.title = element_text(size = multiPlotTitleTextSize))
p4tree = plot_tree(ps4,
method = "treeonly",
ladderize = "left",
title = paste0("tip_glom(..., h=", h1, ")")) +
theme(plot.title = element_text(size = multiPlotTitleTextSize))

# group plots together
grid.arrange(nrow = 1, p2tree, p3tree, p4tree)


##进行count丰度和相对丰度的比较
plot_abundance = function(physeq, ylabn = "",
Facet = "order",
Color = "phylum"){
mphyseq = psmelt(physeq)
mphyseq <- subset(mphyseq, Abundance > 0)
ggplot(data = mphyseq,
mapping = aes_string(x = "sex", y = "Abundance",
color = Color, fill = Color)) +
geom_violin(fill = NA) +
geom_point(size = 1, alpha = 0.3,
position = position_jitter(width = 0.3)) +
facet_wrap(facets = Facet) + ylab(ylabn) +
scale_y_log10()
}

# Transform to relative abundance. Save as new object.
ps3ra = transform_sample_counts(ps3, function(x){x / sum(x)})
plot_abundance(ps3,"Original Abundances")
plot_abundance(ps3ra,"Relative Abundances")

挑选指定的门类去展示

#挑选指定的分类等级去展示
psOrd = subset_taxa(ps3ra, order == "Lactobacillales")
plot_abundance(psOrd, Facet = "genus", Color = NULL)
qplot(sample_data(ps0)$age, geom = "histogram") + xlab("age")
qplot(log10(rowSums(otu_table(ps0)))) +
xlab("Logged counts-per-sample")

#对小鼠年龄进行统计
qplot(sample_data(ps0)$age, geom = "histogram") + xlab("age")
#统计样品的序列条数
qplot(log10(rowSums(otu_table(ps0)))) +
xlab("Logged counts-per-sample")

pslog <- transform_sample_counts(ps0, function(x) log(1 + x))
sample_data(pslog)$age_binned <- cut(sample_data(pslog)$age,
breaks = c(0, 100, 200, 400))
out.wuf.log <- ordinate(pslog, method = "MDS", distance = "unifrac")
evals <- out.wuf.log$values$Eigenvalues
plot_ordination(pslog, out.wuf.log, color = "age_binned") +
labs(col = "Binned Age") +
coord_fixed(sqrt(evals[2] / evals[1]))

setup_example(c("phyloseq", "ggplot2", "plyr", "dplyr", "reshape2",
"ade4", "ggrepel"))

out.bc.log <- ordinate(pslog, method = "MDS", distance = "bray")
evals <- out.bc.log$values$Eigenvalues
plot_ordination(pslog, out.bc.log, color = "age_binned") +
coord_fixed(sqrt(evals[2] / evals[1])) +
labs(col = "Binned Age")

out.dpcoa.log <- ordinate(pslog, method = "DPCoA")

evals <- out.dpcoa.log$eig
plot_ordination(pslog, out.dpcoa.log, color = "age_binned",
shape = "family_relationship") +
coord_fixed(sqrt(evals[2] / evals[1])) +
labs(col = "Binned Age", shape = "Litter")

plot_ordination(pslog, out.dpcoa.log, type = "species", color = "Phylum") +
coord_fixed(sqrt(evals[2] / evals[1]))

### pca

out.wuf.log <- ordinate(pslog, method = "PCoA", distance = "wunifrac")
abund <- otu_table(pslog)
abund_ranks <- t(apply(abund, 1, rank))

evals <- out.wuf.log$values$Eigenvalues
plot_ordination(pslog, out.wuf.log, color = "age_binned",
shape = "family_relationship") +
coord_fixed(sqrt(evals[2] / evals[1])) +
labs(col = "Binned Age", shape = "Litter")

plot_ordination(pslog, out.wuf.log, type = "species", color = "Phylum") +
coord_fixed(sqrt(evals[2] / evals[1]))


abund_df <- melt(abund, value.name = "abund") %>%
left_join(melt(abund_ranks, value.name = "rank"))
colnames(abund_df) <- c("sample", "seq", "abund", "rank")

abund_df <- melt(abund, value.name = "abund") %>%
left_join(melt(abund_ranks, value.name = "rank"))
colnames(abund_df) <- c("sample", "seq", "abund", "rank")

sample_ix <- sample(1:nrow(abund_df), 8)
ggplot(abund_df %>%
filter(sample %in% abund_df$sample[sample_ix])) +
geom_point(aes(x = abund, y = rank, col = sample),
position = position_jitter(width = 0.2), size = .7) +
labs(x = "Abundance", y = "Thresholded rank") +
scale_color_brewer(palette = "Set2")

abund_ranks <- abund_ranks - 329
abund_ranks[abund_ranks < 1] <- 1

ranks_pca <- dudi.pca(abund_ranks, scannf = F, nf = 3)
row_scores <- data.frame(li = ranks_pca$li,
SampleID = rownames(abund_ranks))

col_scores <- data.frame(co = ranks_pca$co,
seq = colnames(abund_ranks))


tax <- tax_table(ps)@.Data %>%
data.frame(stringsAsFactors = FALSE)
tax$seq <- rownames(tax)

main_orders <- c("Clostridiales", "Bacteroidales", "Lactobacillales",
"Coriobacteriales")
tax$order[!(tax$Order %in% main_orders)] <- "Other"
tax$order <- factor(tax$order, levels = c(main_orders, "Other"))
tax$otu_id <- seq_len(ncol(otu_table(ps)))

row_scores <- row_scores %>%
left_join(sample_data(pslog))
col_scores <- col_scores %>%
left_join(tax)

evals_prop <- 100 * (ranks_pca$eig / sum(ranks_pca$eig))
ggplot() +
geom_point(data = row_scores, aes(x = li.Axis1, y = li.Axis2), shape = 2) +
geom_point(data = col_scores, aes(x = 25 * co.Comp1, y = 25 * co.Comp2, col = order),
size = .3, alpha = 0.6) +
scale_color_brewer(palette = "Set2") +
facet_grid(~ age_binned) +
guides(col = guide_legend(override.aes = list(size = 3))) +
labs(x = sprintf("Axis1 [%s%% variance]", round(evals_prop[1], 2)),
y = sprintf("Axis2 [%s%% variance]", round(evals_prop[2], 2))) +
coord_fixed(sqrt(ranks_pca$eig[2] / ranks_pca$eig[1])) +
theme(panel.border = element_rect(color = "#787878", fill = alpha("white", 0)))

ps_ccpna <- ordinate(pslog, "CCA", formula = pslog ~ age_binned + family_relationship)
ps_scores <- vegan::scores(ps_ccpna)
sites <- data.frame(ps_scores$sites)
sites$SampleID <- rownames(sites)
sites <- sites %>%
left_join(sample_data(ps))

species <- data.frame(ps_scores$species)
dim(species)
species$otu_id <- seq_along(colnames(otu_table(ps))[-414])
species <- species %>%
left_join(tax)

library("ggrepel")
evals_prop <- 100 * ps_ccpna$CCA$eig[1:2] / sum(ps_ccpna$CA$eig)
ggplot() +
geom_point(data = sites, aes(x = CCA1, y = CCA2), shape = 2, alpha = 0.5) +
geom_point(data = species, aes(x = CCA1, y = CCA2, col = order), size = 0.5) +
geom_text_repel(data = species %>% filter(CCA2 < -2),
aes(x = CCA1, y = CCA2, label = otu_id),
size = 1.5, segment.size = 0.1) +
facet_grid(. ~ age_binned) +
guides(col = guide_legend(override.aes = list(size = 3))) +
labs(x = sprintf("Axis1 [%s%% variance]", round(evals_prop[1], 2)),
y = sprintf("Axis2 [%s%% variance]", round(evals_prop[2], 2))) +
scale_color_brewer(palette = "Set2") +
coord_fixed(sqrt(ps_ccpna$CCA$eig[2] / ps_ccpna$CCA$eig[1])) +
theme(panel.border = element_rect(color = "#787878", fill = alpha("white", 0)))

ggplot() +
geom_point(data = sites, aes(x = CCA1, y = CCA2), shape = 2, alpha = 0.5) +
geom_point(data = species, aes(x = CCA1, y = CCA2, col = order), size = 0.5) +
geom_text_repel(data = species %>% filter(CCA2 < -2),
aes(x = CCA1, y = CCA2, label = otu_id),
size = 1.5, segment.size = 0.1) +
facet_grid(. ~ family_relationship) +
guides(col = guide_legend(override.aes = list(size = 3))) +
labs(x = sprintf("Axis1 [%s%% variance]", round(evals_prop[1], 2)),
y = sprintf("Axis2 [%s%% variance]", round(evals_prop[2], 2))) +
scale_color_brewer(palette = "Set2") +
coord_fixed(sqrt(ps_ccpna$CCA$eig[2] / ps_ccpna$CCA$eig[1])) +
theme(panel.border = element_rect(color = "#787878", fill = alpha("white", 0)))


pls监督模型分类预测

library(caret)
setup_example(c("phyloseq", "ggplot2", "caret", "plyr", "dplyr"))
sample_data(pslog)$age2 <- cut(sample_data(ps0)$age, c(0, 100, 400))
dataMatrix <- data.frame(age = sample_data(pslog)$age2, otu_table(pslog))
# take 8 mice at random to be the training set, and the remaining 4 the test set
trainingMice <- sample(unique(sample_data(pslog)$host_subject_id), size = 8)
inTrain <- which(sample_data(ps)$host_subject_id %in% trainingMice)
length(inTrain)
inTrain=inTrain[1:228]
training <- dataMatrix[inTrain,]
testing <- dataMatrix[-inTrain,]


plsFit <- train(age ~ ., data = training,
method = "pls", preProc = "center")

# 预测
plsClasses <- predict(plsFit, newdata = testing)
table(plsClasses, testing$age)

# 换个算法继续尝试
library("randomForest")
rfFit <- train(age ~ ., data = training, method = "rf",
preProc = "center", proximity = TRUE)
rfClasses <- predict(rfFit, newdata = testing)
table(rfClasses, testing$age)

library("pls")
pls_biplot <- list("loadings" = loadings(plsFit$finalModel),
"scores" = scores(plsFit$finalModel))
class(pls_biplot$scores) <- "matrix"
pls_biplot$scores <- data.frame(sample_data(pslog)[inTrain, ],
pls_biplot$scores)
tax <- tax_table(ps)@.Data %>%
data.frame(stringsAsFactors = FALSE)
main_orders <- c("Clostridiales", "Bacteroidales", "Lactobacillales",
"Coriobacteriales")
tax$order[!(tax$order %in% main_orders)] <- "Other"
tax$order <- factor(tax$order, levels = c(main_orders, "Other"))
class(pls_biplot$loadings) <- "matrix"
pls_biplot$loadings <- data.frame(tax, pls_biplot$loadings)

ggplot() +
geom_point(data = pls_biplot$scores,
aes(x = Comp.1, y = Comp.2), shape = 2) +
geom_point(data = pls_biplot$loadings,
aes(x = 25 * Comp.1, y = 25 * Comp.2, col = order),
size = 0.3, alpha = 0.6) +
scale_color_brewer(palette = "Set2") +
labs(x = "Axis1", y = "Axis2", col = "Binned Age") +
guides(col = guide_legend(override.aes = list(size = 3))) +
facet_grid( ~ age2) +
theme(panel.border = element_rect(color = "#787878", fill = alpha("white", 0)))



rf_prox <- cmdscale(1 - rfFit$finalModel$proximity) %>%
data.frame(sample_data(pslog)[inTrain, ])
ggplot(rf_prox) +
geom_point(aes(x = X1, y = X2, col = age_binned),
size = .4, alpha = 0.6) +
scale_color_manual(values = c("#A66EB8", "#238DB5", "#748B4F")) +
guides(col = guide_legend(override.aes = list(size = 3))) +
labs(col = "Binned Age", x = "Axis1", y = "Axis2")


as.vector(tax_table(ps)[which.max(importance(rfFit$finalModel)), c("family", "genus")])

## [1] "Lachnospiraceae" "Roseburia"

impOtu <- as.vector(otu_table(pslog)[,which.max(importance(rfFit$finalModel))])
maxImpDF <- data.frame(sample_data(pslog), abund = impOtu)
ggplot(maxImpDF) +
geom_histogram(aes(x = abund)) +
facet_grid(age2 ~ .) +
labs(x = "Abundance of discriminative microbe", y = "Number of samples")

setup_example(c("igraph", "phyloseq", "phyloseqGraphTest", "ggnetwork", "intergraph"))

net <- make_network(ps, max.dist=0.35)
sampledata <- data.frame(sample_data(ps))
V(net)$id <- sampledata[names(V(net)), "host_subject_id"]
V(net)$litter <- sampledata[names(V(net)), "family_relationship"]

class(V(net)$aaa)
V(net)$litter = as.character(V(net)$aaa )


ggplot(net, aes(x = x, y = y, xend = xend, yend = yend), layout = "fruchtermanreingold") +
geom_edges(color = "darkgray") +
geom_nodes(aes(color = id, shape = litter)) +
theme(axis.text = element_blank(), axis.title = element_blank(),
legend.key.height = unit(0.5,"line")) +
guides(col = guide_legend(override.aes = list(size = .25)))


gt <- graph_perm_test(ps, "family_relationship",
grouping = "host_subject_id",
distance = "jaccard", type = "mst")
gt$pval

plot_test_network(gt) +
theme(legend.text = element_text(size = 8),
legend.title = element_text(size = 9))
plot_permutations(gt)

gt <- graph_perm_test(ps, "family_relationship",
grouping = "host_subject_id",
distance = "jaccard", type = "knn", knn = 1)
gt$pval

plot_test_network(gt) +
theme(legend.text = element_text(size = 8),
legend.title = element_text(size = 9))
plot_permutations(gt)

gt <- graph_perm_test(ps, "family_relationship",
grouping = "host_subject_id",
distance = "bray", type = "knn", knn = 2)
gt$pval

plot_test_network(gt) +
theme(legend.text = element_text(size = 8),
legend.title = element_text(size = 9))
plot_permutations(gt)



gt <- graph_perm_test(ps, "family_relationship", grouping = "host_subject_id",
distance = "bray", type = "threshold.nedges", nedges = 720,
keep.isolates = FALSE)
gt$pval

plot_test_network(gt) +
theme(legend.text = element_text(size = 8),
legend.title = element_text(size = 9))
plot_permutations(gt)

gt <- graph_perm_test(ps, "family_relationship", grouping = "host_subject_id",
distance = "bray", type = "threshold.nedges", nedges = 2000,
keep.isolates = FALSE)
gt$pval


plot_test_network(gt) +
theme(legend.text = element_text(size = 8),
legend.title = element_text(size = 9))
plot_permutations(gt)


您可能也对以下帖子感兴趣

文章有问题?点此查看未经处理的缓存