18 GSEA

library(tidyverse)
library(msigdbr)
library(clusterProfiler)
library(fgsea)

packageVersion("msigdbr")


# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("org.EcK12.eg.db")

clusterProfiler 패키지 사용을 위해서는 다음과 같이 설치를 수행함
만약 설치시 libglpk.so.40 관련 에러가 발생할 경우 apt-get install libglpk40 수행

#if (!require("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")

#BiocManager::install("clusterProfiler")

# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("ReactomePA")

# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("enrichplot", force = TRUE)


library(clusterProfiler)
library(tidyverse)
library(org.EcK12.eg.db)
library(ReactomePA)
library(enrichplot)
library(Biostrings)

library(genbankr)

18.1 Geneset database

A gene set is an unordered collection of genes that are functionally related. A pathway can be interpreted as a gene set by ignoring functional relationships among genes.

Gene Ontology (GO)
Kyoto Encyclopedia of Genes and Genomes (KEGG)
Reactome (https://reactome.org/)
org.db

library(org.EcK12.eg.db)

str(org.EcK12.eg.db)
class(org.EcK12.eg.db)
columns(org.EcK12.eg.db)

ls("package:org.EcK12.eg.db")

x <- org.EcK12.egENZYME
mapped_genes <- mappedkeys(x)
xx <- as.list(x[mapped_genes]
xx <- as.list(org.EcK12.egENZYME2EG)
              
as.list(org.EcK12.egALIAS2EG)[1:10]
as.list(org.EcK12.egACCNUM2EG)[1:2]


as.list(org.EcK12.egPATH)
?org.EcK12.egPATH
x <- org.EcK12.egPATH
mapped_genes <- mappedkeys(x)
xx <- as.list(x[mapped_genes])

as.list(org.EcK12.egACCNUM) %>% head
as.list(org.EcK12.egACCNUM2EG) %>% head
as.list(org.EcK12.egPATH)

18.2 Read essential data

Essential 데이터 읽고 분류
cgenes: CNN 모형이 예측한 결과
apgenes: APT 모형이 예측한 결과

library(Biostrings)
fileno <- "040"

name_md = paste0(fileno, "-AP_ConvLSTM_relEC")
name_date = "-220619"

tmps <- readDNAStringSet(paste0("06-PredData/", name_md, name_date, "_trainingset_results.fasta"))

geneinfo <- strsplit(names(tmps), split="__") %>% 
  do.call(rbind, .) %>% 
  as.data.frame 
colnames(geneinfo) <- c("no", "ess", "pred", "id", "acc")
essdata <- geneinfo %>% 
  mutate(esstrue=ess=="ess", esspred=pred==1) %>% 
  dplyr::select(-c(no, ess, pred))


table(essdata$esstrue, essdata$esspred)

geneinfo2 <- essdata %>% 
  filter(acc!="-") %>% 
  #separate(Gene, c("bnum", "accnum"), sep="__") %>% 
  separate(acc, c("accnum", "tmp"), sep="\\.") %>% 
  dplyr::select(-tmp)

  # mutate(egnum = {
  #   x = as.list(org.EcK12.egACCNUM2EG)[accnum]
  #   x[sapply(x, is.null)] <- NA
  #   unlist(x, use.names = FALSE)
  #   }) %>% 
  # mutate(fcval = C_Prob + 1) ## for the use of gseGO

read genbank file

#gbinfo <- readGenBank("03-PrepData/BW25113_CP009273.gb")

geneinfo3 <- mcols(gbinfo@cds) %>% 
  as.data.frame %>% 
  dplyr::select(gene, locus_tag) %>% 
  left_join(geneinfo2, by=c("locus_tag"="id")) %>% 
  mutate(egnum = {
    x = as.list(org.EcK12.egSYMBOL2EG)[gene]
    x[sapply(x, is.null)] <- NA
    unlist(x, use.names = FALSE)
    })


#selgenes018 <- geneinfo3
selgenes040 <- geneinfo3

selgenes018 %>% 
  inner_join(selgenes040, by=c("locus_tag"="locus_tag")) %>% 
  filter(esstrue.x == T & esstrue.y == T)

18.3 GO enrichment analysis

ACCnumber를 Entrez gene id로 전환
enrichGO 함수 사용


selgenes <- geneinfo3 %>%
  filter(esspred==T & esstrue==F)

ggo <- groupGO(gene     = selgenes$egnum,
               OrgDb    = org.EcK12.eg.db,
               ont      = "CC",
               level    = 3,
               readable = TRUE)

ggo@result %>% 
  filter(Count > 0) %>% 
  arrange(desc(Count))
  


egobp <- enrichGO(gene          = selgenes$egnum,
                OrgDb         = org.EcK12.eg.db,
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.1,
                qvalueCutoff  = 0.5,
        readable      = TRUE)
goplot(egobp)
egobp@result %>% 
  filter(p.adjust < 0.1 & qvalue < 0.5)
  

egocc <- enrichGO(gene          = selgenes$egnum,
                OrgDb         = org.EcK12.eg.db,
                ont           = "CC",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.1,
                qvalueCutoff  = 0.5,
        readable      = TRUE)
#goplot(egocc)
egocc@result

egomf <- enrichGO(gene          = selgenes$egnum,
                OrgDb         = org.EcK12.eg.db,
                ont           = "MF",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.1,
                qvalueCutoff  = 0.5,
        readable      = TRUE)
#egomf@result %>% 
#  filter(if_any(where(is.numeric), ~!is.na(.))) 
#goplot(egomf)
egomf@result

GO:0009244 lipopolysaccharide core region biosynthetic process
GO:0046401    lipopolysaccharide core region metabolic process
GO:0009103             lipopolysaccharide biosynthetic process
GO:0008653                lipopolysaccharide metabolic process
GO:1903509                    liposaccharide metabolic process
GO:0009312                oligosaccharide biosynthetic process
GO:0006629                             lipid metabolic process
GO:0044255                    cellular lipid metabolic process
GO:0034637          cellular carbohydrate biosynthetic process
GO:0008610                          lipid biosynthetic process
GO:0051246             regulation of protein metabolic process
GO:0033692        cellular polysaccharide biosynthetic process
GO:0009628                        response to abiotic stimulus

18.4 KEGG enrichment analysis

KEGG
Supported organisms (http://www.genome.jp/kegg/catalog/org_list.html)

search_kegg_organism('eco', by='kegg_code')
ecoli <- search_kegg_organism('Escherichia coli', by='scientific_name')

over-representation analysis


selgenes <- selgenes018 %>% 
  filter(esspred==T & esstrue==F) 

kk <- enrichKEGG(gene         = selgenes$egnum,
                 organism     = 'eco',
                 keyType = "ncbi-geneid",
                 pvalueCutoff = 0.1)

kk@result %>% 
  dplyr::select(Description, qvalue)

selgenes <- selgenes040 %>% 
  filter(esspred==T & esstrue==F) 
kk <- enrichKEGG(gene         = selgenes$egnum,
                 organism     = 'eco',
                 keyType = "ncbi-geneid",
                 pvalueCutoff = 0.1)

kk@result %>% 
  dplyr::select(Description, qvalue)

018

eco00540    Lipopolysaccharide biosynthesis 0.00531493  
eco00030    Pentose phosphate pathway   0.12748791  
eco00650    Butanoate metabolism    0.17372322  
eco03440    Homologous recombination    0.20574266  
eco01250    Biosynthesis of nucleotide sugars   0.20574266

040
eco00540    Lipopolysaccharide biosynthesis 0.002359464
eco00650    Butanoate metabolism    0.050898735
eco02020    Two-component system    0.156421239
eco01250    Biosynthesis of nucleotide sugars   0.156421239
eco02010    ABC transporters    0.185342182
eco00030    Pentose phosphate pathway   0.396099588

library(pathview)

browseKEGG(kk, 'eco00540')
browseKEGG(kk, 'eco00030')
browseKEGG(kk, 'eco00650')

tmp <- pathview(gene.data  = selgenes$egnum,
                     pathway.id = "eco00540",
                     species    = "eco")

tmp <- pathview(gene.data  = selgenes$egnum,
                     pathway.id = "eco00030",
                     species    = "eco")

tmp <- pathview(gene.data  = selgenes$egnum,
                     pathway.id = "eco00650",
                     species    = "eco")

kk@result

gene set enrichment analysis


fcval <- selgenes$fcval
names(fcval) <- selgenes$egnum
fcval <- sort(fcval, decreasing = T)

kk <- gseKEGG(gene         = fcval,
                 organism     = 'eco',
                 keyType = "ncbi-geneid",
                  scoreType = "pos",
                 pvalueCutoff = 1)

mkk2 <- gseMKEGG(geneList = fcval,
                 organism     = 'eco',
                 keyType = "ncbi-geneid",
                 scoreType = "pos",
                 pvalueCutoff = 1)

kk@result

selgenes018 %>% 
  inner_join(selgenes040, by=c("locus_tag"="locus_tag"))

18.5 Reactome

17 DEG Excercise

19 Lecture Note