19 Lecture Note
19.1 Lecture 02 (0526)
설명이나 실습을 위해 R 사용법 및 데이터 분석 기초 5.19(목), 5.26(목)
강의를 진행하며 작성한 코드입니다.
numeric vector
- Ctrl + Alt + i 누르면 코드청크 생성
- 커서를 해당 라인에 두구 Ctrl + Enter 누르면 해당 라인 실행
2 + 2
((2-1)^2 + (1-3)^2 )^(1/2)
2 + 2; 2 - 2
- 기억할 단축키
- Ctrl + 1 : 편집창
- Ctrl + 2 : 콘솔창
sqrt((4+3)*(2+1))
2^3 + 3^2
- 변수에 값 저장하기
x <- 1
y <- 2
z <- x + y
- 변수 값 보기
- numeric vector 만들기
x <- 1:5
x
y <- seq(1, 5, 1)
y <- seq(from=1, to=5, by=1)
y <- seq(to=5, from=1, by=1)
y <- seq(5, 1, -1)
?seq
y
- 연습문제 풀이
- vector의 인덱싱 (첫번째 값의 인덱스는 1부터 시작)
- logical vector 설명
- 기본 그래픽 함수 이용하는 방법은 필요할때만 설명
-
which
함수 활용한 40 이상만 선택
precip > 40
precip[precip > 40]
idx <- precip > 40
which(idx)
myprecip <- precip[which(idx)]
myprecip
plot(myprecip)
-
which
함수 활용한 짝수 만들기
mynumers <- 1:1000
mynumers_res <- mynumers %% 2
i <- which(mynumers_res == 0)
evens <- mynumers[i]
evens
- 홀수 값을 저장하는 벡터 만들고 하나씩 샘플링 (
sample
함수 사용)
odds <- seq(1, 1000, 2)
length(evens)
length(odds)
?sample
mysample <- c(sample(evens, 1), sample(odds, 1))
print(mysample[1])
- 문자열 붙이기,
,
로 나누어진 벡터들 각각의 원소를 붙여줌
paste("X", "Y", "Z", sep="_")
paste("X", "Y", "Z", sep="")
paste("X", "Y", "Z", "X", "Y", "Z", "X", sep="")
- 여러 벡터에서 각각의 원소를 붙여주는 기능
- collapse (하나의 벡터에서 해당 벡터의 원소들을 붙여주는 기능)
x <- sample(c("A", "C", "G", "T"), size=20, replace = T)
x2 <- paste(x, collapse = "")
myseq <- rep("", 5)
myseq[1] <- paste(sample(c("A", "C", "G", "T"), size=20, replace = T), collapse="")
myseq[2] <- paste(sample(c("A", "C", "G", "T"), size=20, replace = T), collapse="")
myseq[3] <- paste(sample(c("A", "C", "G", "T"), size=20, replace = T), collapse="")
myseq[4] <- paste(sample(c("A", "C", "G", "T"), size=20, replace = T), collapse="")
myseq[5] <- paste(sample(c("A", "C", "G", "T"), size=20, replace = T), collapse="")
myseq
- 예제 ‘TAAGTCT’ 바코드를 각 서열의 3’에 붙여보기
-
split
함수 사용
-
factor
간단한 설명
x <- c("Red", "Blue", "Yellow", "Red", "Blue")
x
y <- factor(x)
y
y[1] <- "gold"
levels(y)[4] <- "gold"
y
y[1] <- "gold"
y
- 데이터에서
factor
들 보기
- 아미노산 및 각 아미노산에 해당하는 코돈 표현 예제
aa <- c("Phe", "Leu", "Ser")
class(aa)
aa <- factor(aa)
class(aa)
aa <- list()
aa[[1]] <- c("UUU", "UUC")
aa
aa[[2]] <- c("UUA", "UUG", "CUA", "CUU", "CUG", "CUC")
aa
class(aa)
names(aa) <- c("Phe", "Leu")
aa
aa[[1]][1]
aa[[2]][3]
- useful functions
matrix
- 연습문제 풀이 성적별 테이블 정렬
mynum <- sample(1:100, 20, T)
mynum
score <- matrix(mynum, nrow = 10, ncol = 2)
score
myrowname <- paste("Name", 1:10, sep="")
myrowname
rownames(score) <- myrowname
score
colnames(score) <- c("Math", "Eng")
total_score <- score[,1] + score[,2]
total_score <- score[,"Math"] + score[,"Eng"]
sort(total_score, decreasing = T)
score
o <- order(total_score, decreasing = T)
o
score[o,]
data.frame
math_score <- sample(1:100, 10, T)
eng_score <- sample(1:100, 10, T)
score <- data.frame(math_score, eng_score)
score
score$math_score
score$eng_score
- 추가 연습문제: 수학, 영어 성적을 더해서
total_score
를 만들고 이 값을 기준으로 내림차순으로 score 데이터프레임을 정렬 하시오.
total_score <- score$math_score + score$eng_score
o <- order(total_score, decreasing = T)
score[o,]
list
score
class(score)
mynum
class(mynum)
z <- list()
z[[1]] <- score
z[[2]] <- mynum
z
names(z) <- c("dataframe", "numericvector")
z
z$dataframe
z$numericvector
- 리스트 만들때 미리 원소의 개수를 알고 있으면 그 원소의 개수에 맞게 생성해 주는 것이 좋음
aa <- vector("list", 5)
- 각 아미노산에 해당하는 코돈 길이가 달라도 list 형태로 저장 가능
aa <- list()
aa[[1]] <- c("UUU", "UUC")
aa
aa[[2]] <- c("UUA", "UUG", "CUA", "CUU", "CUG", "CUC")
aa
class(aa)
names(aa) <- c("Phe", "Leu")
aa
as.data.frame(aa)
- 바람직한 데이터는 column은 변수, row는 샘플 구조의 데이터. 예를 들어서, 변수:먹이, 수명 –> 컬럼, 샘플: 마우스1, 마우스2 –> Row, 등
aa <- c("Phe", "Leu")
codon <- c("UUU", "UUC", "UUA", "UUG", "CUA", "CUU", "CUG", "CUC")
data.frame(aa, codon)
aa <- c(rep("Phe", 2), rep("Leu",6))
codon <- c("UUU", "UUC", "UUA", "UUG", "CUA", "CUU", "CUG", "CUC")
data.frame(aa, codon)
functions
source("myscript.R")
- 함수만들기
my_function_name <- function(parameter1, parameter2, ... ){
##any statements
return(object)
}
## mynumers: numeric vector
mymean <- function(mynumers){
#cat("Input numbers are", mynumers, "\n")
numbers_mean <- sum(mynumers)/length(mynumers)
#out <- paste("The average is ", numbers_mean, ".\n", sep="")
#cat(out)
return(numbers_mean)
}
- 함수 만든 후 loading 할 때
{
,}
밖에서 Ctrl + Enter로 로딩
- 데이터 표준화 예제
mysd <- function(x){
numbers_sd <- sqrt(sum((x - mymean(x))^2)/(length(x)-1))
return(numbers_sd)
}
#x <- sample(1:100, 1000, T)
x <- rnorm(1000, 10, 5)
z <- (x - mymean(x))/mysd(x)
mymean(z)
mysd(z)
mymean(x)
mysd(x)
plot(density(x))
density(x)
- 코드비교
for
x <- 1:10
for(i in x){
cat(i, "\n")
flush.console()
}
x <- 1
while(x <= 10){
cat(x, "\n")
flush.console()
x <- x + 1
}
- 랜덤 서열만들기 예제
x <- sample(c("A", "C", "G", "T"), size=20, replace = T)
x2 <- paste(x, collapse = "")
myseq <- rep("", 5)
myseq[1] <- paste(sample(c("A", "C", "G", "T"), size=20, replace = T), collapse="")
myseq[2] <- paste(sample(c("A", "C", "G", "T"), size=20, replace = T), collapse="")
myseq[3] <- paste(sample(c("A", "C", "G", "T"), size=20, replace = T), collapse="")
myseq[4] <- paste(sample(c("A", "C", "G", "T"), size=20, replace = T), collapse="")
myseq[5] <- paste(sample(c("A", "C", "G", "T"), size=20, replace = T), collapse="")
numseq <- 7
myseq <- rep("", numseq)
for(i in 1:length(myseq)){
x <- sample(c("A", "C", "G", "T"), size=20, replace = T)
myseq[i] <- paste(x, collapse="")
}
myseq
Data transformation
-
UsingR
패키지의 babies 데이터셋을 적절히 변환하는 예제.with
와within
활용법 알아두기 (설명 안 함)
library(UsingR)
head(babies)
str(babies)
new_babies <- within(babies, {
gestation[gestation==999] <- NA
dwt[dwt==999] <- NA
smoke = factor(smoke)
levels(smoke) = list(
"never" = 0,
"smoke now" = 1,
"until current pregnancy" = 2,
"once did, not now" = 3)
})
str(new_babies)
fit <- lm(gestation~smoke, new_babies)
summary(fit) ## t-test 결과
anova(fit)
19.2 Lecture 03 (0609)
Class 01
- file write
- package 설치
- package 사용하기 위해서는
library
로 불러와야 함 - ctrl + enter 누르면 코드청크 코드 실행 (커서가 위치한)
library(UsingR)
batting
?batting
str(batting)
batting$playerID
batting$HR
batting$SO
mydata <- data.frame(batting$playerID, batting$HR, batting$SO)
str(mydata)
mydata
mydata <- data.frame(playerID = batting$playerID,
HR = batting$HR,
SO = batting$SO)
mydata
- file writing
- Arguments 이름을 지정할 경우 순서를 바꿔도 됨
write.table(x=mydata, file="mydata.txt")
?write.table
write.table(mydata,
file = "mydata.csv",
quote = F,
row.names = FALSE,
sep = ",")
- file read
myread <- read.table("mydata.csv", sep=",", header = T)
myread
str(myread)
?read.table
myread$HR
- 상관계수
- 회귀모형
plot(myread$HR, myread$SO)
mycor <- cor(myread$HR, myread$SO)
mycor
fit <- lm(myread$HR ~ myread$SO)
plot(myread$HR, myread$SO)
abline(fit)
text(50, 170, round(mycor,2))
- 엑셀파일 읽기
library(readxl)
#read.table("mydata.xlsx")
mydf <- read_xlsx("mydata.xlsx")
str(mydf)
class(mydf)
mydf$playerID
Class 2
- apply
- 반복작업
#library(UsingR)
mydata <- data.frame(playerID = batting$playerID,
HR = batting$HR,
SO = batting$SO)
mean(mydata$SO)
mean(mydata[,3])
mean(mydata$HR)
mean(mydata[,2])
mymean <- rep(0, 2)
mymean <- c(0, 0)
for(i in 1:2){
mymean[i] <- mean(mydata[,i+1])
}
mymean
# ctrl + shift + c 를 누르면 주석
# x <- 1:10
# for(i in x){
# cat(i, "\n")
# flush.console()
# }
- apply 사용
- airquality data example
?airquality
data(airquality)
str(airquality)
airquality
grp <- airquality$Month
class(grp)
grpf <- factor(grp)
airlist <- split(airquality, grpf)
?split
class(airlist)
airlist
- list 설명
a <- 1:100
b <- 11:111
class(a)
class(b)
length(a)
length(b)
mydf <- data.frame(a, b)
mylist <- list(a=a, b=b)
mylist
mylist$a
mylist$b
mylist[[1]]
mylist$a
- ozone 의 평균 구하는 함수 만들기
length(airlist)
airlist$`9`
class(airlist[[5]])
mean(airlist[[5]]$Ozone)
airlist[[5]]$Ozone
?mean
mean(airlist[[5]]$Ozone, na.rm=T)
- list 의 ozone 별 평균
airlist
mymean <- c(0,0,0,0,0)
mymean[1] <- mean(airlist[[1]]$Ozone, na.rm=T)
mymean[2] <- mean(airlist[[2]]$Ozone, na.rm=T)
mymean[3] <- mean(airlist[[3]]$Ozone, na.rm=T)
mymean[4] <- mean(airlist[[4]]$Ozone, na.rm=T)
mymean[5] <- mean(airlist[[5]]$Ozone, na.rm=T)
mymean
lapply(airlist, function(x){mean(x$Ozone, na.rm=T)})
myozone <- function(x){
z <- mean(x$Ozone, na.rm=T)
return(z)
}
lapply(airlist, myozone)
Class 3
- graphics
- 산포도
x <- c(1:100)
y <- x*2 + rnorm(100)
myxy <- data.frame(x,y)
plot(myxy)
plot(myxy$x, myxy$y)
plot(x=myxy$x, y=myxy$y)
plot(y~x, data=myxy)
- histogram
x <- rnorm(100)
hist(x,
br=20,
xlim=c(-3,3),
main="Main text",
xlab="X label",
ylab="y label")
airquality$Wind
hist(airquality$Wind, br=50)
hist(airquality$Wind, br=10)
- boxplot
x <- rnorm(100)
class(x)
boxplot(x)
mydf <- airquality[,c(1, 2, 3, 4)]
mydf <- airquality[,c("Ozone", "Solar.R", "Wind", "Temp")]
class(mydf)
boxplot(mydf)
- barplot
x <- sample(1:12, 200, replace = T)
x
tab_x <- table(x)
y <- sample(1:12, 200, replace = T)
tab_y <- table(y)
tab_xy <- rbind(tab_x, tab_y)
barplot(tab_xy)
barplot(tab_xy, beside = T)
barplot(tab_xy, beside = T, col=c("darkblue","red"))
barplot(tab_xy, beside = T, col=c("darkblue","red"), xlab="Month")
barplot(tab_xy, beside = T, col=c("darkblue","red"), xlab="Month", horiz=TRUE, legend.text = c("x", "y"))
x <- rnorm(500)
hist(x, 100)
y <- 2*x + rnorm(500, mean=5, sd=1)
z <- c(x,y)
hist(z, br=100)
hist(z, br=100, probability = T)
zd <- density(z)
lines(zd)
x <- rnorm(500)
y <- 2*x + rnorm(500, mean=5, sd=1)
myxy <- data.frame(x, y)
myxy
plot(x, y, data=myxy, xlim=c(-5, 5), ylim=c(-5, 15), pch=3)
idx <- which(x<0)
points(x[idx], y[idx], col="red")
fit <- lm(y~x)
abline(fit)
plot(y~x, data=myxy, xlim=c(-5, 5), ylim=c(-5, 15), pch=3)
idx <- which(x<0)
points(myxy[idx,], col="red")
fit <- lm(y~x, data=myxy)
abline(fit)
- tidyverse
- 우선 필요한 패키지 설치 및 로딩
#library(tidyverse)
#install.packages("tibble")
#install.packages("dplyr")
#install.packages("tidyr")
library(tibble)
library(dplyr)
library(tidyr)
- code chunk shortcut CTRL + ALT + I
df1 <- data.frame(x = 1:3, y = 3:1)
class(df1)
df1
df2 <- tibble(df1)
class(df2)
airquality
myair <- airquality[1:5,]
myair_long <- pivot_longer(myair, cols = c("Ozone", "Solar.R", "Wind", "Temp"))
myair_long
myair_long2 <- pivot_longer(myair, c(Ozone, Solar.R, Wind, Temp))
myair_long2
myair_long3 <- pivot_longer(myair, !c(Month, Day))
myair_long3
?pivot_longer
myair_long <- pivot_longer(myair,
c(Ozone, Solar.R, Wind, Temp),
names_to = "Type",
values_to = "Observation")
myair_long
stocks <- tibble(
year = c(2015, 2015, 2016, 2016),
month = c( 1, 2, 1, 2),
profit = c(1.88, 0.59, 0.92, 0.17)
)
stocks
pivot_wider(stocks, names_from = year, values_from = profit)
?pivot_wider
A 41 O M D A 190 S M D
- dplyr
- pipe operator 단축키 shift + ctrl + m
library(dplyr)
x <- 1:100
y <- mean(x)
z <- sin(y)
sqrt(z)
sqrt(sin(mean(1:100)))
1:100 %>% mean %>% sin %>% sqrt
1:100 %>%
mean %>%
sin %>%
sqrt
x <- 1:5
paste(x, "a", sep="-")
x %>%
paste("a", sep="-") %>%
paste(collapse = ":")
- filter
head(iris)
iris %>% head
iris %>% filter(Species=="setosa")
iris %>% filter(Species=="setosa" | Species=="versicolor")
iris %>% filter(Species=="setosa" & Species=="versicolor")
iris %>%
filter(Species=="setosa" | Species=="versicolor") %>%
str
iris %>% select(Species, everything()) %>% head(5)
iris %>% select(Species, everything())
iris %>% select(-Species)
iris %>% select(Petal.Length, starts_with('S'))
iris %>% select(starts_with('S'))
iris %>% select(obs = starts_with('S'))
iris
irisratio <- iris$Sepal.Length/iris$Sepal.Width
iris2 <- cbind(iris, irisratio)
iris2 <- iris %>% mutate(sepal_ratio = Sepal.Length/Sepal.Width)
head(iris2)
iris %>% summarise(m1 = mean(Sepal.Length), m2 = mean(Sepal.Width))
iris %>%
group_by(Species) %>%
summarise(mean(Sepal.Width))
- airquality 평균
19.3 Lecture 04 (0616)
Class 1
- 강의노트 주소 https://greendaygh.github.io/kribbr2022/index.html
- Rmarkdown 사용시 코드 청크 입력 Ctrl + Alt + i
- tidyverse 설치시 dbplyr 패키지 문제로 설치가 되지 않음. 우선 아래 4개 패키지 개별 설치 후 진행
#install.packages("tidyverse")
install.packages("tibble")
install.packages("tidyr")
install.packages("dplyr")
install.packages("ggplot2")
- Shortcut Ctrl + enter: 커서 있는 라인 콘솔에서 실행
- 인덱싱에 의한 subset과
subset
함수를 이용한 subset
library(UsingR)
str(babies)
head(babies)
## variables
newbabies <- babies[,c("gestation", "dwt")]
head(newbabies)
newbabies[newbabies==999] <- NA
newbabies
df <- subset(babies, select=c(gestation, wt, dwt))
df
mean(df$gestation)
mean(df$wt)
mean(df$dwt)
apply(df, 2, mean)
?apply
- 인덱스 (숫자)로만 인덱싱하는 방식은 지양함 (가독률 낮음)
-
subset
함수 사용할 수 있으며tidyverse
패키지의dplyr::select
사용 추천
-
apply
,lapply
많이 씀
airquality
str(airquality)
class(airquality)
grp <- factor(airquality$Month)
grp
class(grp)
airlist <- split(airquality, grp)
airlist
class(airlist)
# remove Month, Day
df <- subset(airquality, select=-c(Month, Day))
df
df2 <- split(df, grp)
length(df2)
lapply(df2, colMeans)
colMeans(df2$`5`, na.rm = T)
lapply(df2, colMeans, na.rm = T)
-
gse93819
예제 데이터 두 그룹으로 나누고 각 유전자별 평균
myexp <- read.csv("https://raw.githubusercontent.com/greendaygh/kribbr2022/main/examples/gse93819_expression_values.csv", header=T)
https://raw.githubusercontent.com/greendaygh/kribbr2022/main/examples/gse93819_expression_values.csv
위 주소 그대로 복사 후 브라우저 주소창에 입력
CTRL + A 로 모두 선택 후 CTRL+C 복사
Rstudio > File > New file > Text file 생성 후 붙여넣기
CTRL + S "myexample1.csv"로 저장
- 발현데이터를 둘로 나누고 각 그룹의 유전자 발현 평균 구하기
myexp <- read.csv("myexample1.csv", header=T)
str(myexp)
head(myexp)
myexp1 <- myexp[,1:10]
myexp2 <- myexp[,11:20]
myexp1mean <- apply(myexp1, 1, mean)
myexp2mean <- apply(myexp2, 1, mean)
myexpmean <- cbind(myexp1mean, myexp2mean)
myexpmean
- cbind 주의점: 이름이 달라도 병합이 되고 이런 문제 때문에 dplyr
join
사용
head(names(myexp1mean))
head(names(myexp2mean))
head(myexpmean)
names(myexp1mean)[1] <- "myexp"
head(names(myexp1mean))
head(names(myexp2mean))
myexpmean2 <- cbind(myexp1mean, myexp2mean)
head(myexpmean2)
plot(myexpmean)
class(myexpmean)
str(myexpmean)
df <- as.data.frame(myexpmean)
str(df)
plot(df)
mydiff <- df$myexp1mean - df$myexp2mean
mydiff
hist(mydiff, br=100)
- airquality long 형 변환
- pipe operator short cut: SHift + ctrl + m
library(dplyr)
library(tidyr)
data(airquality)
airquality %>% head
airquality %>% str
airquality %>%
pivot_longer(c(Ozone, Solar.R, Wind, Temp))
myexpmeandf <- as.data.frame(myexpmean)
myexpmeandf %>% str
myexpmeandf %>% head
myexpmeandf %>%
pivot_longer(c(myexp1mean, myexp2mean))
babies %>% str
newbabies <- babies %>%
dplyr::select(id, age, gestation, wt, dwt, smoke)
newbabies %>% str
newbabies %>%
filter(gestation != 999 & dwt != 999)
Class 2
- random sequence 생성 grep, grepl 실습
mydna <- sample(c("A", "C", "G", "T"), 20, replace = T)
mydna <- paste(mydna, collapse = "")
n <- 100
mydna <- rep("", n)
mydna
## case1
for(i in 1:n){
mydna[i] <- paste(sample(c("A", "C", "G", "T"), 20, replace = T),
collapse = "")
}
mydna
## case2
mydna <- rep("", n)
for(i in 1:n){
mydna[i] <- sample(c("A", "C", "G", "T"), 20, replace = T) %>%
paste(collapse = "")
}
mydna
## case3
mydna <- sapply(1:n, function(x){
sample(c("A", "C", "G", "T"), 20, replace = T) %>%
paste(collapse = "")
})
mydna
## return index
grep("ATG", mydna)
grepl("ATG", mydna)
grep("Se", colnames(iris))
colnames(iris)[grep("Se", colnames(iris))]
- dplyr의 주요 함수들과 같이 사용되는 helper function
#newbabies %>%
# filter(man1 != 999 & man2 != 999.. ) x
#newbabies$gestation == 999
newbabies %>%
filter(!if_any(.fns = function(x){x==999}))
- NA 제외 후 새로 데이터 생성
is.na(airquality$Ozone)
airquality$Ozone
mynewdata <- airquality %>%
filter(!if_any(.fns=is.na))
mynewdata
- arrange 기존 방법과 비교
mynewdata$Ozone
sort(mynewdata$Ozone)
i <- order(mynewdata$Ozone) # return index
i
mynewdata[i,]
mynewdata %>%
arrange(Ozone)
- mutate 새로운 변수 추가 가능
class(myexpmean)
df <- as.data.frame(myexpmean)
class(df)
newdf <- df %>%
mutate(diff = sqrt((myexp1mean-myexp2mean)^2))
newdf$diff > mean(newdf$diff)
newdf <- df %>%
mutate(diff = sqrt((myexp1mean-myexp2mean)^2)) %>%
mutate(diffl = diff>mean(diff))
newdf %>%
filter(diffl==T)
- summarise 를 이용해서 타입별 평균과 표준편차 구하기
iris %>% str
iris %>%
group_by(Species) %>%
summarise(Sepal.Length.mean = mean(Sepal.Length),
Sepal.Width.mean = mean(Sepal.Width))
irismean <- iris %>%
group_by(Species) %>%
summarise(across(.fns=mean))
irissd <- iris %>%
group_by(Species) %>%
summarise(across(.fns=sd))
-
cbind
대신join
사용
df1 <- data.frame(id=c(1,2,3,4,5,6), age=c(30, 41, 33, 56, 20, 17))
df2 <- data.frame(id=c(4,5,6,7,8,9), gender=c("f", "f", "m", "m", "f", "m"))
inner_join(df1, df2, by="id")
left_join(df1, df2, "id")
right_join(df1, df2, "id")
full_join(df1, df2, "id")
- ggplot mean
irismean
irissd
irisplot <- irismean %>%
pivot_longer(-Species)
ggplot(irisplot, aes(x=Species, y=value, fill=name)) +
geom_bar(stat="identity", position="dodge")
- ggplot error bar
d1 <- irismean %>%
pivot_longer(-Species)
d2 <- irissd %>%
pivot_longer(-Species)
df <- left_join(d1, d2, by=c("Species", "name"))
df
ggplot(df, aes(x=Species, y=value.x, fill=name)) +
geom_bar(stat="identity", position="dodge") +
geom_errorbar(aes(ymin=value.x-value.y, ymax=value.x+value.y),
position=position_dodge(width=0.9),
width=0.4)
-
gse93819
를 4개 그룹으로 나누고 각 그룹별 평균, 표준편차 구하고 barplot 그리기 - 산포도 그리고 평균보다 2배 이상 차이나는 유전자를 골라서 색을 다르게 그리기기
tmpid <- rep(c("A", "B", "C", "D"), each=5)
groupid <- paste(tmpid, 1:5, sep="")
groupid
colnames(myexp) <- groupid
data_meana <- myexp %>%
dplyr::select(starts_with("A")) %>%
apply(1, mean)
data_meanb <- myexp %>%
dplyr::select(starts_with("B")) %>%
apply(1, mean)
data_sda <- myexp %>%
dplyr::select(starts_with("A")) %>%
apply(1, sd)
data_sdb <- myexp %>%
dplyr::select(starts_with("B")) %>%
apply(1, sd)
mydata <- data.frame(data_meana,
data_meanb,
data_sda,
data_sdb)
mydata
## for the rownames_to_column()
library(tibble)
mydataplot <- mydata %>%
rownames_to_column() %>%
pivot_longer(-rowname)
d1 <- mydataplot %>%
filter(grepl("data_mean",name)) %>%
slice(1:100)
d2 <- mydataplot %>%
filter(grepl("data_sd",name)) %>%
slice(1:100)
ggplot(d1, aes(x=rowname, y=value, fill=name)) +
geom_bar(stat="identity", position="dodge")
19.4 Lecture 05 (0707)
Class 01
- 수업 전 확인사항
tidyverse
로딩 –> 권한 문제로 인한 설치 에러는 조사 후 업데이트 예정 - 아래
https://greendaygh.github.io/kribbr2022/
코드 다운로드 - basic
library(tidyverse)
myexp <- read.csv("https://raw.githubusercontent.com/greendaygh/kribbr2022/main/examples/gse93819_expression_values.csv", header=T)
- R재설치
- r-project.org > CRAN > korea (제일 마지막) > download R for windows > base > download R 최신버전
- 디렉토리 실행 설치 (디폴트 옵션)
- Rstudio > Tools > Global options >
[Default] [64-bit] C:\Program Files\R\R-4.2.1
선택 > Rstudio 재실행 - install tidyverse
- ggplot 사용법
library(ggplot2)
head(iris)
str(iris)
ggplot(data=iris) +
geom_point(mapping=aes(x=Petal.Length, y=Petal.Width))
ggplot(data=iris, mapping=aes(x=Petal.Length, y=Petal.Width)) +
geom_point()
- aes 옵션은 다음 여섯 가지 주로 사용 x, y, group, color, fill, shape
- aes 에 쓰인 옵션은 다른 그룹의 데이터일 경우 다른 모양으로 (컬러로) 표현 한다는 의미
ggplot(iris, aes(x=Petal.Length,
y=Petal.Width,
color=Species,
shape=Species)) +
geom_point(color="black")
- bargraph
- geom_bar 기본 설정 stat = “count”
dat <- data.frame(x1=rnorm(100))
str(dat)
ggplot(dat, aes(x=x1)) +
geom_bar()
ggplot(dat, aes(x=x1)) +
geom_bar(stat="bin", bins=30)
x1 <- rnorm(10)
x2 <- rnorm(10)
dat <- data.frame(x1, x2)
ggplot(dat, aes(x=x1, y=x2)) +
geom_bar(stat="identity")
- geom_xx 사용하는 어떤 레이어에서건 data, aes
ggplot(dat, aes(x=x1, y=x2)) +
geom_bar(stat="identity") +
geom_point(aes(col="red", size=5))
- aes 안에서 정의된 옵션은 guide 붙음
ggplot(dat, aes(x=x1, y=x2)) +
geom_bar(stat="identity") +
geom_point(col="red", size=5)
- 이산형 데이터 –> barplot
- 연속형 –> histogram (범위 지정 필요)
x1 <- as.factor(1:3)
y1 <- tabulate(sample(x1, 100, replace=T))
dat <- data.frame(x1, y1)
ggplot(dat, aes(x=x1, y=y1)) +
geom_bar()
ggplot(dat, aes(x=x1, y=y1)) +
geom_bar(stat="identity")
ggplot(dat, aes(x=x1, y=y1, fill=x1)) +
geom_bar(stat="identity") +
xlab("Category") +
ylab("Count") +
ggtitle("Barplot") +
guides(fill="none")
Class 02
- 그룹별로 각 유전자의 발현의 평균을 bar 그래프로 비교
- 데이터 기본: row: 하나의 샘플, col: 하나의 변수 (특히 data.frame 형태)
myexp <- read.csv("https://github.com/greendaygh/kribbr2022/raw/main/examples/gse93819_expression_values.csv", header=T)
myexp
- Case1 (고전적 프로그래밍 방법)
- for문 사용 (느림), 저장공간 미리 준비
- 각 데이터들마다 동일한 코드 반복, 데이터 바뀔시 재사용성 낮음
group1data <- as.matrix(myexp[,c(1:5)])
##myexp[,c(6:10)]
mymean1 <- rep(0, 5)
for(i in 1:5){
mymean1[i] <- mean(group1data[i,], na.rm=T)
}
- Case 2 (apply)
- 비교적 빠른 처리 가능
- 통계량 계산 후 그래프용 데이터 재구성 필요
- apply 함수는 익숙해질 필요 있음
- Case 3 (tidyverse)
- 가장 지향하는 방식
- 분석 목적에 따른 샘플, 변수, 값 구분하기
- 목적: 그룹별로 각 유전자의 발현의 평균을 bar 그래프로 비교
- 발현의 평균을 계산해야 하므로 유전자가 변수가 (컬럼) 되야함
- 발현 데이터셋 3종: 1) main expression data 2) sample metadata 3) probe metadata (gene info)
mygroup <- rep(c("A", "B", "C", "D"), each=5)
mysample <- data.frame(sample_name=names(myexp), group_name=mygroup)
mysample
myexpt <- t(myexp)
myexpt <- myexp %>%
rownames_to_column() %>%
pivot_longer(cols = -rowname) %>%
pivot_wider(names_from = rowname, values_from = value)
myexpt[1:5, 1:5]
dim(myexpt)
class(myexpt)
myexpt <- data.frame(myexpt)
- 두 데이터셋 통합하기 위해 같은 변수이름 사용 (다른 변수끼리도 통합하는데 기준 변수로 사용 가능)
myexp2 <- myexpt %>%
rownames_to_column(var = "sample_name") %>%
left_join(mysample, by=c("sample_name")) %>%
select(group_name, everything()) %>%
group_by(group_name)
myexp2 %>%
summarise(mean(X1415670_at))
is.numeric(myexp2$X1415670_at)
myexp2 %>%
summarise(across(everything(), mean))
mymean <- myexp2 %>%
summarise(across(where(is.numeric), mean))
mymean
- ggplot 의 입력 데이터는 가능하면 long형 tidy 데이터
- long 형 데이터는 3가지 타입의 컬럼 (id, name, value) 를 가짐 (id 는 두개 이상 가능)
myplotdata <- mymean %>%
pivot_longer(cols = -c("group_name"))
ggplot(myplotdata, aes(x=name, y=value, fill=group_name)) +
geom_bar(stat="identity", position = "dodge") +
scale_x_discrete(limit=c("X1415670_at", "X1415671_at")) +
ylim(-10, 2000)
Class 03
- error bar 그리기
myexp2 <- myexpt %>%
rownames_to_column(var = "sample_name") %>%
left_join(mysample, by=c("sample_name")) %>%
select(group_name, everything()) %>%
group_by(group_name)
mymean <- myexp2 %>%
summarise(across(where(is.numeric), mean))
mysd <- myexp2 %>%
summarise(across(where(is.numeric), sd))
mymean2 <- mymean %>%
pivot_longer(-group_name, values_to = "mean")
mysd2 <- mysd %>%
pivot_longer(-group_name, values_to = "sd")
myplotdata <- mymean2 %>%
left_join(mysd2, by=c("group_name", "name"))
myplotdata$name[1:10]
ggplot(myplotdata, aes(x=name, y=mean, fill=group_name)) +
geom_bar(stat="identity", position = "dodge") +
geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd),
position = position_dodge(width=0.9),
width=0.4,
color="#666666") +
theme_minimal() +
scale_x_discrete(limit=myplotdata$name[1:10]) +
ylim(-10, 3000) +
theme(axis.text.x = element_text(angle=90))
- 연속형 변수는 line graph 가능
x1 <- c(12, 21, 40)
x2 <- c(33, 10, 82)
dat <- data.frame(x1, x2)
ggplot(dat, aes(x=x1, y=x2)) +
geom_line()
- 범주형 변수는 기본적으로 barplot
x1 <- as.factor(c(1:3))
y1 <- c(33, 10, 82)
dat <- data.frame(cate=x1, count=y1)
str(dat)
ggplot(dat, aes(x=cate, y=count, group="a")) +
geom_bar(stat="identity") +
geom_line() +
geom_point(size=5)
- smooth line
ggplot(mtcars, aes(x=mpg, y=hp)) +
geom_point() +
geom_smooth(method = "lm", se=FALSE)
ggplot(mtcars, aes(x=mpg, y=hp)) +
geom_point() +
geom_smooth(span=0.5)
- simulation
weights <- rnorm(200, 75, 5)
heights <- weights + rnorm(200, 100, 5)
classes <- sample(c("A", "B", "C", "D"), size=length(heights), replace = T)
mydata <- data.frame(heights, weights, classes)
str(mydata)
- 몸무게와 키의 산포도를 그리고 반별로 어떻게 상관성이 다른지 그리고 전체적으로 어떻게 다른지 알 수 있는 그래프를 그리시오
ggplot(mydata, aes(x=heights, y=weights)) +
geom_point(aes(color=classes)) +
geom_smooth()
Class 04
- 두 그룹의 프루브들에 대해서 산포도를 그리기
myexp2 <- myexpt %>%
rownames_to_column(var = "sample_name") %>%
left_join(mysample, by=c("sample_name")) %>%
select(group_name, everything()) %>%
group_by(group_name)
mymean <- myexp2 %>%
summarise(across(where(is.numeric), mean))
- transpose (더 효율적 방법 찾아보기)
- 그룹별 산포도
mymeant <- t(mymean)
class(mymeant)
mymeant <- data.frame(t(mymean)[-1,])
names(mymeant) <- mymean$group_name
myplotdata <- mymeant %>%
dplyr::select(A, B) %>%
mutate(Anum = as.numeric(A), Bnum = as.numeric(B)) %>%
dplyr::select(Anum, Bnum)
ggplot(myplotdata, aes(x=Anum, y=Bnum)) +
geom_point()
ggplot(myplotdata, aes(x=Anum, y=Bnum)) +
geom_point() +
scale_x_continuous(trans = "log") +
scale_y_continuous(trans = "log") +
geom_smooth(method = "lm") +
geom_abline(intercept = 0, slope = 1, color="red", size=2)
mymeant %>%
dplyr::select(A, C) %>%
mutate(X = as.numeric(A), Y = as.numeric(C)) %>%
dplyr::select(X, Y) %>%
ggplot(aes(x=X, y=Y)) +
geom_point() +
scale_x_continuous(trans = "log") +
scale_y_continuous(trans = "log") +
geom_smooth(method = "lm") +
geom_abline(intercept = 0, slope = 1, color="red", size=2)
mymeant %>%
dplyr::select(A, D) %>%
mutate(X = as.numeric(A), Y = as.numeric(D)) %>%
dplyr::select(X, Y) %>%
ggplot(aes(x=X, y=Y)) +
geom_point() +
scale_x_continuous(trans = "log") +
scale_y_continuous(trans = "log") +
geom_smooth(method = "lm") +
geom_abline(intercept = 0, slope = 1, color="red", size=2)
- facet
- R에서 공식 표현하는 방법 y = ax + b, R expression: y ~ x
ggplot(iris, aes(x=Petal.Length, y=Petal.Width)) +
geom_point(aes(color=Species, shape=Species)) +
facet_wrap(~Species, nrow=2)
- iris 예제
str(iris)
mycate <- factor(sample(c(0,1), nrow(iris), replace=T))
myiris <- data.frame(iris, mycate)
str(myiris)
ggplot(myiris, aes(x=Petal.Length, y=Petal.Width)) +
geom_point(aes(color=Species, shape=Species)) +
facet_grid(mycate~Species)
- plot 을 변수에 저장할 수 있음
myplot <- ggplot(myiris, aes(x=Petal.Length, y=Petal.Width)) +
geom_point(aes(color=Species, shape=Species)) +
facet_grid(mycate~Species) +
labs(x='Four classes',
y='Number of students',
title='Blood type distribution',
subtitle = 'Blood type distribution from the 200 students',
fill='Blood Types')
myplot +
theme_bw() +
#scale_color_brewer(palette="YlGnBu")
scale_fill_manual(values = c("orange", "skyblue", "royalblue", "blue"))
- A, B, C, D 서로간의 비교에 대한 산포도 여러개 켄버스에 그리기
mydat <- mymeant %>%
slice(1:1000) %>%
mutate(X1 = as.numeric(A),
X2 = as.numeric(B),
X3 = as.numeric(C),
X4 = as.numeric(D)) %>%
dplyr::select(starts_with("X"))
tmp1 <- mydat %>%
select(x=X1, y=X2) %>%
mutate(group_name = "AB")
tmp2 <- mydat %>%
select(x=X1, y=X3) %>%
mutate(group_name = "AC")
tmp3 <- mydat %>%
select(x=X1, y=X4) %>%
mutate(group_name = "AD")
tmp4 <- mydat %>%
select(x=X2, y=X3) %>%
mutate(group_name = "BC")
tmp5 <- mydat %>%
select(x=X3, y=X4) %>%
mutate(group_name = "CD")
tmp1 %>%
bind_rows(tmp2) %>%
bind_rows(tmp3) %>%
bind_rows(tmp4) %>%
bind_rows(tmp5) %>%
ggplot(aes(x=x, y=y)) +
geom_point() +
scale_x_continuous(trans = "log") +
scale_y_continuous(trans = "log") +
geom_smooth(method = "lm") +
geom_abline(intercept = 0, slope = 1, color="red", size=2) +
facet_wrap(~group_name)
19.5 Lecture 06 (0714)
Class 01
- 확인사항
- package install (tidyverse) 가능여부 확인, 설치가 되지 않을경우 (https://cran.rstudio.com/ blablabla 접근 에러) 아래와 같이
options
에 repos 변수 CRAN을 “http://” 로 바꿔줌
- package install (tidyverse) 가능여부 확인, 설치가 되지 않을경우 (https://cran.rstudio.com/ blablabla 접근 에러) 아래와 같이
options(repos = c(CRAN="http://cran.rstudio.com"))
install.packages("tidyverse")
- dbplyr 설치 에러가 발생할 경우 R 재설치 (https://greendaygh.github.io/kribbr2022/lecture-5-note.html)
- 아래 데이터 download 되는지 확인
library(tidyverse)
myexp <- read.csv("https://raw.githubusercontent.com/greendaygh/kribbr2022/main/examples/gse93819_expression_values.csv", header=T)
-
- 한글로된 디렉토리 이름 사용하지 않기
- D 또는 C 드라이브에 영문 디렉토리 Rstudy 하나 만들기
- Rstudio > File > new project > .. > “Rstudy” 디렉토리 설정 > “lecture6” 프로젝트이름 입력
- working directory 가 D:/Rstudy/lecture6 확인, Rmarkdown 파일 하나 생성
- boxplot + violin 그리기
library(tidyverse)
#install.packages("viridis")
library(viridis)
# create a dataset
mydata <- data.frame(name = c(rep("A", 100), rep("B", 100)),
value= c(rnorm(100, 10, 1), rnorm(100, 12, 1)))
mydata
ggplot(mydata, aes(x=name, y=value, fill=name)) +
geom_violin() +
geom_boxplot(width=0.2)
mylab <- mydata %>%
group_by(name) %>%
summarise(num = n())
mylab <- paste(mylab$name, "\n n=", mylab$num, sep="")
ggplot(mydata, aes(x=name, y=value, fill=name)) +
geom_violin() +
geom_boxplot(width=0.2) +
scale_x_discrete(labels = mylab) +
scale_fill_viridis(discrete = TRUE) +
theme(legend.position="none")
- boxplot for expression data
myexp <- read.csv("https://raw.githubusercontent.com/greendaygh/kribbr2022/main/examples/gse93819_expression_values.csv", header=T)
myplotdata <- myexp %>%
pivot_longer(cols = everything())
ggplot(myplotdata, aes(x=name, y=value)) +
geom_boxplot()
- scale 패키지의
comma
를 사용해서 천단위 수 표현
library(scales)
ggplot(myplotdata, aes(x=name, y=value, fill=name)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE) +
scale_y_continuous(trans="log",
breaks = c(0.1, 20, 1000),
labels = comma) +
theme(axis.text.x = element_text(angle=90),
legend.position = "none") +
labs(title = "baplot",
subtitle = "subtitle",
x = "xlab",
y = "ylab")
Class 02
- 산포도
library(UsingR)
#install.packages("UsingR")
mydata <- data.frame(mpg$displ, mpg$cty)
mpg %>%
dplyr::select(displ, cty) %>%
ggplot(aes(x=cty, y=displ)) +
geom_point(position = "jitter")
mpg
- 추세선
mydata <- mpg %>%
dplyr::select(displ, cty, hwy) %>%
pivot_longer(cols = c(cty, hwy))
ggplot(mydata, aes(x=displ, y=value, color=name)) +
geom_point(position = "jitter") +
scale_color_manual(values = c("#ff0000", "blue"),
labels = c("a", "b")) +
geom_smooth(span=1) +
labs(title="Cars") +
theme(
title = element_text(size=20),
axis.title = element_text(size=12),
legend.title = element_text(size=12)
)
-
scale_size
로 point 특성 변경
mydata
ggplot(mydata, aes(x=displ, y=value, size=value, color=name)) +
geom_point(position = "jitter") +
scale_size(range = c(0.5, 8), guide=F) +
scale_color_viridis(alpha=0.4, discrete = T) +
theme_minimal()
ggplot(mydata, aes(x=displ, y=value, size=value, fill=name)) +
geom_point(position = "jitter", shape=21) +
scale_size(range = c(0.5, 8), guide=F) +
scale_fill_viridis(alpha=0.4, discrete = T) +
theme_bw()
Class 03
- expression data 두 그룹의 평균 발현양 비교
- x: normal group mean vs. y: treatment group mean
- plot: scatter plot
- 차이나는 유전자를 더 크게 (더 밝게) 그리기
myexp2 <- myexp[,1:10]
mygrp <- c(rep("N", 5), rep("T", 5))
samplemeta <- data.frame(sample_name = names(myexp2), group_name=mygrp)
samplemeta
- dplyr 패키지 (pivot_wider) 사용한 데이터 transpose
tmp <- t(myexp2)
dim(tmp)
tmp[1:10, 1:10]
class(tmp)
tmp[,1]
myexp2t <- myexp2 %>%
rownames_to_column() %>%
pivot_longer(cols = -rowname) %>%
#pivot_wider(names_from = name, values_from = value)
pivot_wider(names_from = rowname, values_from = value)
merdata <- samplemeta %>%
left_join(myexp2t, by=c("sample_name" = "name"))
- stat
# mymean <- merdata %>%
# group_by(group_name) %>%
# summarise(across(where(is.numeric), mean))
# mysd <- merdata %>%
# group_by(group_name) %>%
# summarise(across(where(is.numeric), sd))
#
# mymean2 <- mymean %>%
# pivot_longer(-group_name)
#
# mysd2 <- mysd %>%
# pivot_longer(-group_name)
#
# mydata <- mymean2 %>%
# left_join(mysd2, by=c("group_name", "name"))
- 두 그룹 각 유전자 발현의 평균 구하고 산포도 그리기
mymean <- merdata %>%
group_by(group_name) %>%
summarise(across(where(is.numeric), mean))
mymean2 <- mymean %>%
pivot_longer(-group_name) %>%
pivot_wider(names_from=group_name, values_from = value)
ggplot(mymean2, aes(x=N, y=T)) +
geom_point()
- t-test 통한 pvalue 구하기
merdata %>%
group_by(group_name) %>%
summarise(across(where(is.numeric), mean))
mean(c(1:10))
tmp <- t.test(1:10, 2:11)
class(tmp)
#?help
names(tmp)
#attributes(tmp)
#attributes(tmp)$names
tmp$statistic
tmp$p.value
mytest <- function(x){
t.test(x[1:5], x[6:10])$p.value
}
merdata %>%
dplyr::select(-c("sample_name", "group_name")) %>%
apply(2, function(x){t.test(x[1:5], x[6:10])$p.value})
mypvalue <- merdata %>%
dplyr::select(-c("sample_name", "group_name")) %>%
apply(2, mytest)
myplotdata <- data.frame(mypvalue) %>%
rownames_to_column() %>%
left_join(mymean2, by = c("rowname" = "name"))
myplotdata %>%
mutate(mlogp = -log(mypvalue)) %>%
ggplot(aes(x=N, y=T, size=mlogp)) +
geom_point(shape=21, fill="black", alpha=0.2) +
scale_x_continuous(trans = "log", breaks = c(1, 20, 400, 8000)) +
scale_y_continuous(trans = "log", breaks = c(1, 20, 400, 8000))
myplotdata %>%
mutate(mlogp = -log(mypvalue)) %>%
ggplot(aes(x=N, y=T, size=mlogp, fill=mlogp)) +
geom_point(shape=21) +
scale_x_continuous(trans = "log", breaks = c(1, 20, 400, 8000)) +
scale_y_continuous(trans = "log", breaks = c(1, 20, 400, 8000)) +
scale_fill_viridis(alpha=0.2)
- 에러바 있는 barplot
mymean <- merdata %>%
group_by(group_name) %>%
summarise(across(where(is.numeric), mean)) %>%
pivot_longer(-group_name)
mysd <- merdata %>%
group_by(group_name) %>%
summarise(across(where(is.numeric), sd)) %>%
pivot_longer(-group_name)
mydata <- mymean %>%
left_join(mysd, by=c("group_name", "name"))
ggplot(mydata, aes(x=name, y=value.x, fill=group_name)) +
geom_bar(stat="identity", position="dodge", color="black") +
scale_x_discrete(limits = c("1415670_at", "1415671_at")) +
scale_y_continuous(limits = c(0, 3000)) +
geom_errorbar(aes(ymin=value.x, ymax=value.x+value.y),
position = position_dodge(width=0.9),
width = 0.2) +
scale_fill_viridis(discrete = T, option = "D")
- 옆으로 눕히기
ggplot(mydata, aes(x=name, y=value.x, fill=group_name)) +
geom_bar(stat="identity", position="dodge", color="black") +
scale_x_discrete(limits = c("1415670_at", "1415671_at")) +
scale_y_continuous(limits = c(0, 3000)) +
geom_errorbar(aes(ymin=value.x, ymax=value.x+value.y),
position = position_dodge(width=0.9),
width = 0.2) +
scale_fill_viridis(discrete = T, option = "D") +
coord_flip()
ggplot(mydata, aes(x=name, y=value.x, fill=group_name)) +
geom_bar(stat="identity", position="dodge", color="black") +
scale_x_discrete(limits = c("1415670_at", "1415671_at")) +
scale_y_continuous(limits = c(0, 3000)) +
geom_errorbar(aes(ymin=value.x, ymax=value.x+value.y),
position = position_dodge(width=0.9),
width = 0.2) +
scale_fill_viridis(discrete = T, option = "D") +
coord_polar(start = 0)
Class 04
- heat map
val <- mpg$class
num <- 10
df <- expand.grid(y = 1:num, x = 1:num)
df
categ_table <- round(table(val) * ((num*num)/(length(val))))
categ_table
df$category <- factor(rep(names(categ_table), categ_table))
df
ggplot(df, aes(x=x, y=y, fill=category)) +
geom_tile(color="black", size=0.5)
- myexp 데이터로 heatmap
myexp2 <- myexp %>%
rownames_to_column() %>%
slice(1:30) %>%
pivot_longer(-rowname, names_to = "sample_name")
ggplot(myexp2, aes(x=rowname, y=sample_name, fill=value)) +
geom_tile(color="gray", size=0.2) +
theme(
axis.text.x = element_text(angle=90)
) +
scale_fill_viridis_c()
- density plot
myexp %>%
dplyr::select("GSM2462948", "GSM2462949") %>%
rownames_to_column() %>%
pivot_longer(-rowname) %>%
ggplot(aes(x=value, fill=name)) +
geom_histogram() +
scale_x_continuous(trans="log")
myplotdata <- myexp %>%
dplyr::select("GSM2462948", "GSM2462949") %>%
rownames_to_column() %>%
pivot_longer(-rowname)
ggplot(myplotdata, aes(x=value)) +
geom_density(aes(fill=factor(name)), alpha=0.4) +
scale_x_continuous(trans = "log") +
scale_fill_viridis(discrete = T) +
theme_bw()
#install.packages("ggridges")
- ggridges
library(ggridges)
ggplot(myplotdata, aes(x=value)) +
geom_density_ridges(aes(fill=factor(name)), alpha=0.4) +
scale_x_continuous(trans = "log") +
scale_fill_viridis(discrete = T) +
theme_bw()
19.6 Lecture 07 (0804)
class 01
- 프로젝트 만들기
- Rstudio 실행
- File > New project > New directory > New project
- 디렉토리
d:\rstudy
지정 - 디렉토리 이름을 lecture7 입력 후 create project
- Rmarkdown 만들기
- File > New file > R markdown > 기본값 OK
- title, output 남기고 다 지우기
- Ctrl + S 파일 저장
- Bioconductor 패키지 설치
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("IRanges")
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("Homo.sapiens")
- (참고) 코드청크에서는 커서를 코드에 두고 Ctrl + Enter 누르면 실행
library(IRanges)
vignette("IRanges")
browseVignettes("IRanges")
ir1 <- IRanges(start=1:10, width=10)
ir1
i
- OOP 설명
df <- data.frame(x=c(1:5), y=LETTERS[1:5])
class(df)
class(df) <- c("data.frame", "myclass")
class(df)
- class에 따른 method 작성 예제, function 참고
x <- 1:10
y <- c("A", "B", "C", "A", "B")
class(x)
class(y)
mean(x)
table(y)/length(y)
mysummry <- function(z){
if(class(z)=="integer"){
retval <- mean(z)
}else if(class(z)=="character"){
retval <- table(z)/length(z)
}
return(retval)
}
mysummry(x)
mysummry(y)
?summary
- 패키지 사용법 모를 경우: class 확인, 해당 class에 사용되는 methods 확인
library(Homo.sapiens)
class(Homo.sapiens)
?OrganismDb
methods(class="OrganismDb")
mygenes <- genes(Homo.sapiens)
class(mygenes)
columns(Homo.sapiens)
mygenes[1:10]
myexons <- exons(Homo.sapiens)
?GRanges
methods(class="GRanges")
gaps(myexons)
class 02
- Biostrings 설치
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("Biostrings")
library(Biostrings)
x <- DNAString("ATGC")
x <- "ATGC"
x2 <- DNAString(x)
class(x2)
x2[2]
x <- DNAStringSet(c("ATTT", "CCCTA"))
x
?DNAStringSet
x[1]
class(x[1])
x[[1]]
class(x[[1]])
- 내장 변수
DNA_BASES
AA_ALPHABET
- DNA 서열 만들기
x <- sample(DNA_BASES, 10, replace = T)
y <- paste(x, collapse = "")
DNAString(y)
y
reverseComplement(DNAString(y))
?reverseComplement
y2 <- DNAString(y)
letterFrequency(y2, c("G", "C"), as.prob=TRUE)
- 랜덤 DNA 생성 연습
- pipe operator 참고
library(tidyverse)
x0 <- paste(sample(DNA_BASES, 10, replace = T), collapse = "")
x0 <- sample(DNA_BASES, 10, replace = T) %>%
paste(collapse = "")
x <- paste("ATG", x0, "TAG", sep="") %>%
DNAString
x
- DNAstringset 설명
x0 <- c("CTC-NACCAGTAT", "TTGA", "TACCTAGAG")
x <- DNAStringSet(x0)
x
length(x)
width(x)
nchar(x)
x
- 10개 DNAstring 만들기, 함수의 생성과 사용
- function 참고
x0 <- sample(DNA_BASES, 30, replace = T) %>%
paste(collapse = "") %>%
paste("ATG", ., "TAG", sep="") %>%
DNAStringSet
x1 <- sample(DNA_BASES, 30, replace = T) %>%
paste(collapse = "") %>%
paste("ATG", ., "TAG", sep="") %>%
DNAStringSet
x2 <- sample(DNA_BASES, 30, replace = T) %>%
paste(collapse = "") %>%
paste("ATG", ., "TAG", sep="") %>%
DNAStringSet
c(x0, x1, x2)
random_dna <- function(x){
z <- sample(DNA_BASES, x, replace = T) %>%
paste(collapse = "") %>%
paste("ATG", ., "TAG", sep="") %>%
DNAStringSet
return(z)
}
random_dna(30)
-
apply
류 (replicate
) 함수를 이용한 함수의 반복적인 사용 - apply 참고
for(i in 1:10){
cat(i)
}
z <- rep("", 10)
for(i in 1:10){
z[i] <- random_dna(30)
}
DNAStringSet(z)
z <- replicate(10, random_dna(30))
z2 <- do.call(c, z)
z2
names(z2) <- paste("dna", 1:length(z2), sep="")
z2
- ggplot 활용 barplot 그리기
gcratio <- letterFrequency(z2, c("G", "C"), as.prob=TRUE) %>%
rowSums()
df <- data.frame(names=names(z2), gcratio)
df
barplot(df$gcratio)
ggplot(df, aes(x=names, y=gcratio, fill=names)) +
geom_bar(stat="identity") +
scale_y_continuous(limits = c(0, 1)) +
scale_fill_brewer(palette = "green") +
theme_bw() +
labs(title="GC ratio") +
xlab("DNA") +
ylab("GC ratio")
- successiveViews 활용한 서열 보기
class(z2)
z2[1]
Views(z2[[2]], start=1, end=10)
Views(z2[[2]], start=1, width=10)
successiveViews(z2[[2]], width=c(10, 10, 10, 10))
successiveViews(z2[[2]], width=rep(10, 4))
#successiveViews(z2[[2]], width=rep(length(z2[[2]]), 4))
x <- random_dna(994)
successiveViews(x[[1]], width=rep(40, nchar(x[[1]])/40))
class 03
- DNA sequence read and write
names(x) <- "mysequence"
writeXStringSet(x, filepath = "myseq.fasta", format = "fasta")
mynewseq <- readDNAStringSet(filepath = "myseq.fasta")
mynewseq
z2
writeXStringSet(z2, filepath = "z2.fasta")
readDNAStringSet(filepath = "z2.fasta")
- yeast chr1 ORF 추정
- orfinder, ncbi
data(yeastSEQCHR1)
yeastSEQCHR1
nchar(yeastSEQCHR1)
yeast1 <- DNAStringSet(yeastSEQCHR1)
writeXStringSet(yeast1, filepath = "yeast1.fasta")
- yeast chr1 코돈 비율 계산
yeast1orf <- readDNAStringSet(filepath = "yeast1.cds")
yeast1orf
tricodon <- trinucleotideFrequency(yeast1orf)
whole_codon <- colSums(tricodon)
class(whole_codon)
df <- data.frame(whole_codon) %>%
rownames_to_column()
ggplot(df, aes(x=rowname, y=whole_codon)) +
geom_bar(stat="identity") +
scale_y_continuous() +
#scale_fill_brewer(palette = "green") +
theme_bw() +
labs(title="Yeast chr1 codon frequency") +
xlab("Codon") +
ylab("Codon frequency") +
theme(
axis.text.x = element_text(angle=90)
)
class 04
- 코돈 아미노산 변환, barplot 그리기
- data analysis with tidyverse
df
GENETIC_CODE
AMINO_ACID_CODE
GENETIC_CODE[1]
GENETIC_CODE["TTT"]
GENETIC_CODE[df$rowname]
df2 <- df %>%
mutate(AA1 = GENETIC_CODE[rowname]) %>%
mutate(AA3 = AMINO_ACID_CODE[AA1]) %>%
group_by(AA3) %>%
summarise(val=sum(whole_codon))
df2
ggplot(df2, aes(x=AA3, y=val)) +
geom_bar(stat="identity") +
#scale_y_continuous() +
#scale_fill_brewer(palette = "green") +
theme_bw() +
labs(title="Yeast chr1 AA frequency") +
xlab("AA") +
ylab("AA frequency") +
theme(
axis.text.x = element_text(angle=90)
)
- matchpattern 사용하기
class(yeast1[[1]])
hits <- matchPattern("ATG", yeast1[[1]], min.mismatch=0, max.mismatch=0)
hits
class(z2)
hits <- vmatchPattern("ATG", z2, min.mismatch=0, max.mismatch=0)
hits
19.7 Lecture 08 (0811)
class 01
-
rentrez
패키지를 사용한 NCBID 데이터 다운로드
library(rentrez)
entrez_dbs()
entrez_db_summary("nuccore")
covid_paper <- entrez_search(db="pubmed", term="covid-19", retmax=40)
class(covid_paper)
methods(class="esearch")
names(covid_paper)
covid_paper$ids
katipo_search <- entrez_search(db="popset", term="Latrodectus katipo[Organism]")
class(katipo_search)
names(katipo_search)
katipo_search$ids
entsum <- entrez_summary(db="popset", id = katipo_search$ids)
entsum[[2]]$title
entsum[[2]]$article
extract_from_esummary(entsum, "title")
- 텍스트 형태로 다운로드 받으므로 파일에 쓰고
readDNAString
형태로 다시 읽어옴
library(Biostrings)
coi <- entrez_fetch(db="popset", id=katipo_search$ids, rettype="fasta")
write(coi, file = "coi.fasta")
coiseq <- readDNAStringSet(filepath = "coi.fasta")
- 뎅기바이러스 예제
dv <- c("NC_001477", "NC_001474", "NC_001475", "NC_002640")
dvseq <- entrez_fetch(db="nuccore", id=dv, rettype = "fasta")
dvseq
write(dvseq, file="dvseq.fasta")
dvseq <- readDNAStringSet("dvseq.fasta")
dvseq
strwrap(as.character(dvseq[[1]]), width=10)
- Covid-19 예제
sresult <- entrez_search(db="popset", term="Covid-19", retmax=40)
class(sresult)
names(sresult)
sresult$ids
mysummary <- entrez_summary("popset", sresult$ids)
mysummary
#lapply
#sapply
mysummary
extract_title <- function(x){
return(x$title)
}
mysummary[[1]]$title
extract_title(x=mysummary[[1]])
sapply(mysummary, extract_title)
tmp <- extract_from_esummary(mysummary, "title")
tmp
class(tmp)
mydata <- data.frame(title=tmp)
mydata
## sequence download
tmpstr <- entrez_fetch("popset", id=sresult$ids, rettype = "fasta")
write(tmpstr, "covid19.fasta")
covid19seq <- readDNAStringSet("covid19.fasta")
covid19seq
names(covid19seq)
class 02
-
popset
데이터베이스에서 선택한 각 아이템 (스터디별) 사용한 샘플 수 구하기 -
sapply
및 사용자 정의 함수extract_statistics_count
사용한 barplot 그리기
library(tidyverse)
mydatadf <- mydata %>%
rownames_to_column(var = "uid")
mysummary$`2279969783`$statistics$count[4]
mysummary$`2271163415`$statistics$count[4]
mysummary$`2252835834`$statistics$count[4]
## ..
sapply(mysummary, extract_statistics_count)
## build function
extract_statistics_count <- function(x){
## x == mysummary$`2279969783`
z <- x$statistics$count[4]
return(z)
}
cnt <- sapply(mysummary, extract_statistics_count)
cntdf <- data.frame(cnt) %>%
rownames_to_column("uid")
# mydata %>%
# rownames_to_column(var = "uid") %>%
# left_join(y=cnt, by="uid")
mydatadf
cntdf
mydata2 <- left_join(y=mydatadf, x=cntdf, by="uid")
## barplot
ggplot(mydata2, aes(x=uid, y=cnt)) +
geom_bar(stat = "identity") +
theme(
axis.text.x = element_text(angle=90)
) +
ylab("Count")
- 코로나 바이러스 ID들에 대해서 정렬을 수행하고
writePairwiseAlignments
를 이용한 가시화
covid <- data.frame(
species = c(rep("Human", 7), c("Civet", "Civet"), rep("Bat", 3), "Pangolin"),
coronavirus = c("SARS-CoV-2", "SARS-CoV-2", "SARS-CoV-1", "SARS-CoV-1", "SARS-CoV-1", "H-CoV-OC43", "MERS-CoV", "SARS-CoV", "SARS-CoV", "SL-CoV", "SL-CoV", "SL-CoV", "SL-CoV"),
isolate = c("Wuhan Hu-1", "USA-WA-1", "Urbani", "Tor2", "GD03T10013", "UK/London", "EMC-2012", "SZ3", "Civet007", "ZXC21", "WIV16", "RaTG13", "MP789"),
year = c("2020", "2020", "2002", "2002", "2003", "2011", "2011", "2003", "2004", "2015", "2013", "2013", "2020"),
gbacc = c("NC_045512.2", "MN985325.1", "AY278741.1", "AY274119.3", "AY525636.1", "KU131570.1", "NC_019843.3", "AY304486.1", "AY572034.1", "MG772934.1", "KT444582.1", "MN996532.1", "MT084071.1"))
write.csv(covid, file = "covid_table.csv", quote = F, row.names=F)
covid19info <- read.csv("covid_table.csv")
str1 <- entrez_fetch("nuccore", covid19info$gbacc, rettype = "fasta")
str2 <- entrez_fetch("nuccore", covid19info$gbacc, rettype = "gb")
write(str1, "covid19seqs.fasta")
write(str2, "covid19seqs.gb")
library(Biostrings)
covid19seqs <- readDNAStringSet("covid19seqs.fasta")
covid19seqs
aln <- pairwiseAlignment(covid19seqs[1], covid19seqs[2])
aln
writePairwiseAlignments(aln, block.width=50)
writePairwiseAlignments(aln, file = "covidaln.txt", block.width=50)
summary(aln)
consensusMatrix(aln)[,1:10]
consensusString(aln)
- (참고) 문자열 나누기 코드,
sapply
와 사용자 정의 함수extract_first_element
사용법 익히기
covid19seqs
names(covid19seqs)
strsplit("AT GC", split=" ")
strsplit(names(covid19seqs)[1], split=" ")
strsplit(names(covid19seqs)[1:2], split=" ")
tmpstr <- strsplit(names(covid19seqs), split=" ")
tmpstr[1:5]
sapply(tmpstr, extract_first_element)
extract_first_element <- function(x){
return(x[1])
}
myacc <- sapply(tmpstr, extract_first_element)
mytitle <- sapply(tmpstr, function(x){
z <- paste(x[-1], collapse=" ")
return(z)
})
mydf <- data.frame(myacc, mytitle)
names(covid19seqs) <- myacc
covid19seqs
aln <- pairwiseAlignment(covid19seqs[1], covid19seqs[2])
writePairwiseAlignments(aln, block.width=50)
class 03
-
DECIPHER
패키지 활용한 다중 서열 정렬
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("DECIPHER")
library(DECIPHER)
-
DECIPHER
패키지 서열 관리
dbConn <- dbConnect(SQLite(), ":memory:")
Seqs2DB(covid19seqs, "XStringSet", dbConn, "covid19")
BrowseDB(dbConn)
coi <- readDNAStringSet("coi.fasta")
Seqs2DB(coi, "XStringSet", dbConn, "coi")
BrowseDB(dbConn)
l <- IdLengths(dbConn)
Add2DB(l, dbConn)
BrowseDB(dbConn)
covid19seq2 <- SearchDB(dbConn, identifier = "covid19")
dbDisconnect(dbConn)
BrowseDB(dbConn)
-
DECIPHER
패키지AlignSeqs
활용한 서열정렬 및BrowseSeqs
이용한 가시화
covid19sub <- subseq(covid19seq2, 1, 2000)
BrowseSeqs(covid19seq2, colWidth = 80, patterns=DNAStringSet(c("ACTG", "CSC")))
aln2 <- AlignSeqs(covid19sub)
BrowseSeqs(aln2, colWidth = 80)
BrowseSeqs(aln2, colWidth = 80, patterns=DNAStringSet(c("ACTG", "CSC")))
BrowseSeqs(aln2, colWidth = 80, patterns="-", colors="black")
-
RESTRICTION_ENZYMES
활용 예제
data(RESTRICTION_ENZYMES)
RESTRICTION_ENZYMES
rsite <- RESTRICTION_ENZYMES["BsaI"]
d <- DigestDNA(rsite, covid19seq2[1])
unlist(d)
pos <- DigestDNA(rsite, covid19seq2[1], type="positions")
unlist(pos)
BrowseSeqs(covid19seq2[1], colWidth = 100, patterns=rsite)
library(stringr)
str_extract_all(rsite, "[[A-Z]]", simplify = T)
rsite2 <- paste(str_extract_all(rsite, "[[A-Z]]", simplify = T), collapse="")
rsite3 <- as.character(reverseComplement(DNAString(rsite2)))
BrowseSeqs(covid19seq2[1], colWidth = 100, patterns=c(rsite2, rsite3))
아래 관련 알아보기
- DECIPHER AA 적용 여부
- DECIPHER --> ggtree
- 웹에서 BLAST 실행 후 결과 데이터 분석
myhit <- read.csv("F9BA23TY01R-Alignment-HitTable.csv", header = F)
myhit
myseq <- readDNAStringSet("seqdump.txt")
myseq
myaln <- AlignSeqs(myseq)
#AAStringSet(myaln)
BrowseSeqs(myaln)
myconm <- consensusMatrix(myaln)
mycons <- consensusString(aln)
dim(myconm)
myconm[1:10, 1:10]
class 04
- ggtree 활용
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ggtree")
BiocManager::install("tidytree")
BiocManager::install("genbankr")
library(ggtree)
library(tidytree)
tree <- rtree(n = 20)
class(tree)
methods(class="phylo")
ggtree(tree)
names(tree)
ggplot(tree) +
geom_tree() +
theme_tree()
ggtree(tree, branch.length="none")
ggtree(tree, layout="circular") +
geom_tiplab(size=3, aes(angle=angle))
ggtree(tree, layout="circular", branch.length="none") +
geom_tiplab(size=3, aes(angle=angle))
ggtree(tree) +
theme_tree2()
ggtree(tree) +
theme_tree2() +
geom_tiplab() +
geom_hilight(node=23, fill="steelblue", alpha=.4)
as_tibble(tree) ## tidytree
ggtree(tree) +
theme_tree2() +
geom_tiplab() +
geom_hilight(node=23, fill="steelblue", alpha=.4)
dat <- data.frame(node=c(23, 35), type=c("AA", "BB"))
ggtree(tree) +
theme_tree2() +
geom_tiplab() +
geom_hilight(dat, aes(node=node, fill=type), alpha=.4) +
scale_fill_manual(values=c("steelblue", "darkgreen"))
- phylo 클래스 데이터를 tibble 형태로 변환 후 데이터 참고
- genbank 타입 파일에서 정보 읽어오기 (다음시간 계속)
19.8 Lecture 09 (0901)
class 01
- msa
-
NC_045512.2
NCBI 다운로드
library(Biostrings)
library(rentrez)
entrez_dbs()
tmps <- entrez_fetch(db="nuccore", id = "NC_045512.2", rettype = "fasta")
tmps
write.table(tmps, "covid19yu.fasta", quote = F, row.names = F, col.names = F)
download.file(url = "https://raw.githubusercontent.com/greendaygh/kribbr2022/main/covid_table.csv", destfile = "covid_table2.csv")
covid19 <- read.csv("covid_table2.csv")
selid <- covid19$gbacc[1:4]
tmps <- entrez_fetch(db="nuccore", id = selid, rettype = "fasta")
write.table(tmps, "covid19.fasta", quote = F, row.names = F, col.names = F)
- msa 설치
- Bioconductor landing page (구글에서 bioconductor msa)
- installation 코드 복사 후 콘솔에서 붙여넣기
## install
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("msa")
library(msa)
mycovidseq <- readDNAStringSet("covid19.fasta")
class(mycovidseq)
mycovidseq
- 일부 서열만 가지고 실습
- (팁) 블럭지정 후 ctrl + enter 블럭만 실행
#subseq
#substr
mycovidsubseq <- subseq(mycovidseq, 1, 2000)
aln <- msa(mycovidsubseq, method="Muscle")
aln
DNAStringSet(aln)
?msa
print(aln, show="complete")
#msaPrettyPrint(aln, output = "pdf")
alphabetFrequency(aln)
- IRanges 설치Bioconductor landing page (구글에서 bioconductor IRanges 검색 ) installation 코드 복사 후 콘솔에서 붙여넣기
library(IRanges)
nchar(consensusString(aln))
mym <- IRanges(start=c(1, 73), end=c(36, nchar(consensusString(aln))))
colmask(aln) <- mym
print(aln, show="complete")
alphabetFrequency(aln)
unmasked(aln)
- DECIPHER
library(DECIPHER)
aln <- AlignSeqs(mycovidsubseq)
BrowseSeqs(aln)
BrowseSeqs(aln, colWidth = 100)
class 02
- ggtree (bioconductor), treeio (Bioconductor), ape (CRAN) 설치
- MSA - 2) Distance - 3) clustering - 4) Tree
aln <- msa(mycovidsubseq, method="Muscle")
class(aln)
?stringDist
mydist <- stringDist(DNAStringSet(aln))
class(mydist)
- label 축소
- treeio error 발생시, ggtree 패키지 unload (detach(“package:ggtree”, unload = TRUE)) 후에 treeio, ggtree 순서로 로딩
library(treeio)
library(ggtree)
tmps <- strsplit(names(mycovidsubseq), split=" ")
mynames <- sapply(tmps, function(x){x[1]})
names(mycovidsubseq) <- mynames
mycovidsubseq
aln <- msa(mycovidsubseq)
mydist <- stringDist(DNAStringSet(aln))
myclust <- hclust(mydist)
plot(myclust)
myphylo <- as.phylo(myclust)
ggtree(myphylo) +
geom_tiplab(color="firebrick")
ggtree(myphylo, layout="circular") +
geom_tiplab(color="firebrick")
- DECIPHER 버전업 되면서 바뀐부분 체크
aln <- AlignSeqs(mycovidsubseq)
#mydist <- DistanceMatrix(aln)
# myclust <- IdClusters(aln, cutoff=10, method="NJ", type="dendrogram")
# class(mydist)
- as.tibbe
- 여러 그림 그리기
mytree <- rtree(30)
class(mytree)
p <- ggtree(mytree) +
geom_tiplab()
p
a <- runif(30, 0, 1)
b <- 1 - a
df <- data.frame(mytree$tip.label, a, b)
df2 <- pivot_longer(df, -mytree.tip.label)
p2 <- p + geom_facet(panel="bar",
data=df2,
geom=geom_bar,
mapping = aes(x = value, fill = as.factor(name)),
orientation = 'y',
width=0.8,
stat = 'identity') +
xlim_tree(9)
facet_widths(p2, widths = c(1, 2))
- Genome 분석
-
NC_045512.2
NCBI 다운로드
library(Biostrings)
library(rentrez)
tmps <- entrez_fetch(db="nuccore", id = "NC_045512.2", rettype = "fasta")
tmps
write.table(tmps, "covid19yu.fasta", quote = F, row.names = F, col.names = F)
covid19seq <- readDNAStringSet("covid19yu.fasta")
- read genbank
require(genbankr)
tmps <- entrez_fetch(db="nuccore", id = "NC_045512.2", rettype = "gb")
tmps
write.table(tmps, "covid19yu.gb", quote = F, row.names = F, col.names = F)
covidgb <- readGenBank("covid19yu.gb")
covidgb
cds(covidgb)
class(covidgb)
methods(class="GenBankRecord")
covidseq <- getSeq(covidgb)
class(covidgb)
cdsseq <- getSeq(covidseq, cds(covidgb))
class(covidseq)
class 03
library(IRanges)
ir <- IRanges(start = 1, end = 10)
ir
ir <- IRanges(start = 1:10, width = 10:1)
ir
class(ir)
x <- "GATTGCCCCCCTAG"
y <- unlist(strsplit(x, split=""))
z <- Rle(y)
write.table(z, "tmp.txt")
cds(covidgb)
library(ggbio)
ir <- ranges(cds(covidgb))
ir
start(ir)
end(ir)
width(ir)
autoplot(ir) +
theme_bw()
autoplot(ir, aes(fill=width)) +
theme_bw()
ir <- IRanges(c(1, 10, 100), end=c(20, 50, 110))
autoplot(ir)
disjointBins(ir)
autoplot(disjoin(ir))
autoplot(IRanges::reduce(ir))
- ggbio 설치
library(GenomicRanges)
cds(covidgb)
gr <- GRanges(seqnames = "chr1",
ranges = IRanges(1:10, 21:30),
strand = "+")
gr
gr <- GRanges(seqnames = "chr1",
ranges = IRanges(1:10, 21:30),
strand = "+",
names = paste0("g", 1:10),
score = 1:10)
gr
seqnames(gr)
ranges(gr)
strand(gr)
start(gr)
end(gr)
granges(gr)
mcols(gr)
seqlengths(gr) <- 50
autoplot(gr)
gr2 <- GRanges(seqnames = "chr2",
ranges = IRanges(1:10, 21:30),
strand = "+",
names = paste0("g", 11:20),
score = 1:10)
gr3 <- c(gr, gr2)
seqnames(gr3)
seqlengths(gr3)
split(gr3, c(rep("a", 10), rep("b", 10)))
- disjoin
- reduce
class 04
- 대장균 지놈 plot
tmps <- entrez_fetch("nuccore", id="NC_000913.3", rettype="gbwithparts")
write(tmps, "ecoli-mg1655.gb")
ecoligb <- readGenBank("ecoli-mg1655.gb")
ecoli_cds <- cds(ecoligb)
autoplot(ecoli_cds)
seqlengths(ecoli_cds)
ggbio() +
circle(ecoli_cds, geom="ideo", fill = "gray70") +
circle(ecoli_cds, geom="scale", size=3) +
circle(ecoli_cds, geom = "text", aes(label = locus_tag), vjust = 0, size = 3) +
theme(
axis.text.x = element_text(angle=90)
)
gr1 <- granges(ecoli_cds)
gr2 <- granges(ecoli_cds)
mcols(gr2)$test <- rnorm(length(ecoli_cds))
ggplot() +
layout_circle(ecoli_cds, geom = "ideo", fill = "gray70", radius = 9, trackWidth = 1) +
layout_circle(ecoli_cds, geom = "scale", size = 3, trackWidth = 1, scale.n=20) +
layout_circle(gr1, geom = "rect", color = "steelblue", radius = 5) +
layout_circle(gr2, geom = "bar", aes(y=test), size = 3, trackWidth = 1, scale.n=20, radius = 4)
gr1disj <- disjoin(gr1)
ggplot() +
layout_circle(ecoli_cds, geom = "ideo", fill = "gray70", radius = 9, trackWidth = 1) +
layout_circle(ecoli_cds, geom = "scale", size = 3, trackWidth = 1, scale.n=20) +
layout_circle(gr1disj, geom = "rect", color = "steelblue", radius = 5) +
layout_circle(gr2, geom = "bar", aes(y=test), size = 3, trackWidth = 1, scale.n=20, radius = 4)
library(plyranges)
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("plyranges")
covidseq <- getSeq(covidgb)
cdsrng <- cds(covidgb)
cdsseq <- getSeq(covidseq, cdsrng)
gcratio <- rowSums(letterFrequency(cdsseq, c("G", "C"))) / nchar(cdsseq)
cdsrng %>%
dplyr::select(gene, product) %>%
mutate(gcratio)
- gggene install –> CRAN
- Rstudio > Packages > install
library(gggenes)
mydf <- data.frame(seqname = seqnames(cdsrng),
start = start(cdsrng),
end = end(cdsrng),
strand = case_when(
as.vector(strand(cdsrng))=="+"~ TRUE,
as.vector(strand(cdsrng))=="-"~ FALSE
),
gene = mcols(cdsrng)$gene)
ggplot(mydf, aes(xmin = start,
xmax = end,
y = seqname,
label = gene,
fill = gene,
forward = strand)) +
geom_gene_arrow(arrowhead_height = grid::unit(12, "mm"),
arrowhead_width = grid::unit(6, "mm"),
arrow_body_height = grid::unit(12, "mm")) +
geom_gene_label() +
theme(legend.position = "none")
19.9 Lecture 10 (0915)
class 01
명령창에서 fasterq-dump 실행시 에러가 발생할 경우 아래 실행
vdb-config --interactive
Enable Remote Access 가 (X)로 선택되어 있으면 save 후 exit. tab 키로만 움직일 수 있음. Space 키가 선택 (ok).
- Cmd 창에서 dir 입력하면 파일 리스트가 보임
prefetch --
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("GEOquery")
- GDS
library(GEOquery)
.libPaths()
gds <- getGEO(filename = system.file("extdata/GDS507.soft.gz",package="GEOquery"))
gds
class(gds)
methods(class="GDS")
Table(gds)
Columns(gds)
- GSM
gsm <- getGEO(filename=system.file("extdata/GSM11805.txt.gz",package="GEOquery"))
methods(class=class(gsm))
head(Meta(gsm))
Table(gsm)
Columns(gsm)
- GPL
gpl <- getGEO(filename=system.file("extdata/GPL97.annot.gz",package="GEOquery"))
class(gpl)
methods(class="GPL")
Table(gpl)
- GES class
gse <- getGEO(filename=system.file("extdata/GSE781_family.soft.gz",package="GEOquery"))
methods(class=class(gse))
Meta(gse)
head(GSMList(gse))
gsm <- GSMList(gse)[[1]]
Meta(gsm)
Table(gsm)
Columns(gsm)
class 02
- ExpressionSet
myexp <- Table(gds)
mysample <- Columns(gds)
mygene <- Table(gpl)
myeset <- list()
myeset$exp <- myexp
myeset$sample <- mysample
myeset$gene <- mygene
myeset
- Download GSE
gse2553 <- getGEO('GSE2553', GSEMatrix=TRUE)
gse2553
class(gse2553)
class(gse2553[[1]])
myeset <- gse2553[[1]]
exprs(myeset)
fData(myeset)
pData(myeset)
gds <- getGEO(filename=system.file("extdata/GDS507.soft.gz",package="GEOquery"))
class(gds)
methods(class="GDS")
GPL(gds)
eset <- GDS2eSet(gds, do.log2=TRUE)
eset
fData(eset)
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("SummarizedExperiment")
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("airway")
library(airway)
data(airway)
airway
myexp <- assay(airway)
dim(myexp)
myfeature <- rowRanges(airway)
length(myfeature)
myfeature[[1]]
mysample <- colData(airway)
mysample
- SummarizedExpriment 생성
nrows <- 200
ncols <- 6
counts <- matrix(runif(nrows * ncols, 1, 1e4), nrows)
dim(counts)
rowRanges <- GRanges(rep(c("chr1", "chr2"), c(50, 150)),
IRanges(floor(runif(200, 1e5, 1e6)), width=100),
strand=sample(c("+", "-"), 200, TRUE),
feature_id=sprintf("ID%03d", 1:200))
colData <- DataFrame(Treatment=rep(c("ChIP", "Input"), 3),
row.names=LETTERS[1:6])
se <- SummarizedExperiment(assays = list(counts = counts),
rowRanges = rowRanges,
colData = colData
)
se
class 03
- 표준정규분포 모수:
library(tidyverse)
x <- seq(-4, 4, 0.01)
y <- dnorm(x, 0, 1)
?dnorm
dat <- data.frame(x, y)
ggplot(dat, aes(x=x, y=y)) +
geom_point()
y2 <- dnorm(x, 0, 0.5)
dat <- data.frame(x, y, y2)
dat %>%
pivot_longer(-x) %>%
ggplot(aes(x=x, y=value, color=name)) +
geom_point()
nsample <- 16
x <- rnorm(nsample, 0, 1)
y <- rep(1, nsample)
hist(x)
xbar <- mean(x)
dat <- data.frame(x, y)
ggplot(dat, aes(x, y)) +
geom_point()+
geom_point(aes(x=mean(x), y=1), color="blue", size=5, shape=15)
nsample <- 16
x <- rnorm(nsample*2, 0, 1)
y <- c(rep(1, nsample), rep(0.9, nsample))
g <- factor(c(rep(1, nsample), rep(2, nsample)))
dat <- data.frame(x, y, g)
ggplot(dat, aes(x, y)) +
geom_point() +
geom_point(aes(x=mean(x[1:nsample]), y=1), colour="blue", size=5, shape=15) +
geom_point(aes(x=mean(x[(nsample+1):length(x)]), y=0.9), colour="blue", size=5, shape=15) +
scale_y_continuous(limits=c(0, 1.2))
nsample <- 16
nrep <- 10
x <- rnorm(nsample*nrep, 0, 1)
tmpy <- seq(0.1, 1, length.out=nrep)
y <- rep(tmpy, each=nsample)
## ?rep
g <- factor(y)
dat <- data.frame(x, y, g)
head(dat)
## sample means
dat_mean <- dat %>%
group_by(g) %>%
summarise(mean=mean(x))
head(dat_mean)
ggplot() +
geom_point(data=dat, aes(x, y)) +
geom_point(data=dat_mean,
aes(x=mean, y=as.numeric(as.character(g))),
colour="blue",
size=5,
shape=15) +
theme_bw()
- 1000개 샘플링 해서 평균 값을 모집단의 평균값이다 라고 하는 것 보다 100개 샘플링 후 평균을 10번 반복 후 (샘플의) 평균의 분포를 가지고 모집단의 평균을 말하는 것이 더 정확하다
head(dat)
head(dat_mean)
x <- seq(-4, 4, by=0.01)
# distribution of the samples
y <- dnorm(x, mean(dat$x), sd(dat$x))
# distribution of the sample means
y2 <- dnorm(x, mean(dat_mean$mean), sd(dat_mean$mean))
dat2 <- data.frame(x, y, y2)
dat2_long <- dat2 %>%
pivot_longer(cols=c(y, y2))
head(dat2_long)
ggplot(dat2_long, aes(x=x, y=value, color=name)) +
geom_line(size=1.2) +
labs(y="Density") +
theme_bw() +
scale_color_manual(name="Type",
labels=c("Samples", "Sample Means"),
values=c("red", "blue"))
nsample <- 100
x <- rnorm(nsample, 100, 16)
hist(x)
z <- (90 - 100)/(10/sqrt(10))
z
x <- seq(-4, 4, 0.01)
y <- dnorm(x, 0, 1)
dat <- data.frame(x, y)
ggplot(dat, aes(x=x, y=y)) +
geom_point() +
geom_point(aes(x=z, y=0), size=5, color="blue")
class 04
- 모집단의 분포를 가정을 하고 샘플링을 통해서 통계량을 구한 후 해당 통계량이 가정한 모집단 분포의 어디에 위치하는지를 보고 유의성 판단
x1 <- seq(-6, 6, by=0.01)
y1 <- dnorm(x1, 0, 1)
z <- data.frame(x1, y1)
pval <- round(1-pnorm(2,0,1), 4)
ggplot(z) +
geom_line(aes(x1, y1), color="purple") +
geom_vline(xintercept = 2) +
geom_hline(yintercept = 0) +
geom_area(data=filter(z, x1 > 2),
aes(x1, y1),
fill="#80008055") +
annotate("text", x=3, y=0.1, label=pval)
x1 <- seq(-6, 6, by=0.01)
y1 <- dnorm(x1, 0, 1)
x2 <- seq(-1, 11, by=0.01)
y2 <- dnorm(x2, 3, 1)
z <- data.frame(x1, y1, x2, y2)
#pval <- round(1-pnorm(2,0,1), 4)
ggplot(z) +
geom_line(aes(x1, y1), color="blue") +
geom_line(aes(x2, y2), color="red") +
geom_vline(xintercept = 2) +
geom_hline(yintercept = 0) +
geom_area(data=filter(z, x1 > 2),
aes(x1, y1),
fill="#0000ff55") +
geom_area(data=filter(z, x2 < 2),
aes(x2, y2),
fill="#ff000055") +
annotate("text", x=3, y=0.1, label=pval)
- example
mpg <- c(11.4, 13.1, 14.7, 14.7, 15, 15.5, 15.6, 15.9, 16, 16.8)
tstat <- (mean(mpg)-17)/(sd(mpg)/sqrt(length(mpg)))
tstat
x <- seq(-5, 5, length=100)
y <- dt(x, df=length(mpg)-1)
dat <- data.frame(x, y)
ggplot(dat, aes(x, y)) +
geom_line() +
geom_vline(xintercept = tstat)
pt(tstat, df=9)
- DEG (lecture note 06, class 03) 분석 예제
- 아래 예제는 코드를 보충해서 본문으로 옮겼습니다
gds <- getGEO(filename = system.file("extdata/GDS507.soft.gz",package="GEOquery"))
gds
eset <- GDS2eSet(gds, do.log2=TRUE)
eset
myexp <- data.frame(exprs(eset))
myfeature <- fData(eset)
mypheno <- pData(eset)
table(mypheno$disease.state)
class(myexp)
## transformation
mydat1 <- myexp %>%
rownames_to_column() %>%
pivot_longer(cols = -rowname) %>%
pivot_wider(names_from = rowname, values_from = value)
mydat2 <- mypheno %>%
dplyr::select(sample, disease.state) %>%
left_join(mydat1, by = c("sample" = "name"))
mymean <- mydat2 %>%
group_by(disease.state) %>%
summarise(across(where(is.numeric), mean))
mymean2 <- mymean %>%
pivot_longer(-disease.state) %>%
pivot_wider(names_from=disease.state, values_from = value)
ggplot(mymean2, aes(x=normal, y=RCC)) +
geom_point()
19.10 Lecture 11 (1006)
class 01
- install Rfastp
- working 디렉토리에 examples 디렉토리 생성
- SRR11549087 예제 데이터는 다음 스크립트로 다운로드
fastq-dump -X 10000 --split-files SRR11549076
library(Rfastp)
download.file(url = "https://github.com/greendaygh/kribbr2022/raw/main/fastq/SRR11549087_1.fastq", destfile = "examples/SRR11549087_1.fastq")
fqfiles <- dir(path = "examples", pattern = "*.fastq")
fqfiles
"examples/SRR11549087_1.fastq"
example_dir <- "examples"
file.path(example_dir, "SRR11549087_1.fastq")
fastq_report <- rfastp(read1 = file.path(example_dir, "SRR11549087_1.fastq"),
outputFastq = file.path(example_dir, "filtered_SRR11549087_1.fastq"))
?rfastp
- 탐색기로 working direcotry 이동
- 주소창에 cmd 입력
library(GEOquery)
gds <- getGEO(filename=system.file("extdata/GDS507.soft.gz",package="GEOquery"))
methods(class=class(gds))
gsm <- getGEO(filename=system.file("extdata/GSM11805.txt.gz",package="GEOquery"))
class(gsm)
gsm
methods(class=class(gsm))
Table(gsm)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("airway")
library(airway)
data(airway)
class(airway)
aw <- assay(airway)
str(aw)
methods(class=class(aw))
rowRanges(airway)
?rowRanges
colData(airway)
metadata(airway)
class 02
- airway data DEG 분석
library(tidyverse)
aw_assay <- data.frame(assay(airway)[1:1000,])
aw_feature <- rowRanges(airway)[1:1000,]
aw_sample <- colData(airway)
aw_assay_t <- aw_assay %>%
rownames_to_column() %>%
pivot_longer(cols = -rowname) %>%
pivot_wider(names_from = rowname, values_from = value)
aw_assay2 <- aw_assay_t %>%
mutate(grp = aw_sample$dex) %>%
dplyr::select(name, grp, everything())
aw_assay3 <- aw_sample %>%
as.data.frame() %>%
dplyr::select(dex) %>%
rownames_to_column() %>%
left_join(aw_assay_t, by=c("rowname" = "name"))
- plot
aw_assay_mean <- aw_assay2 %>%
group_by(grp) %>%
summarise(across(where(is.numeric), mean))
mydata <- aw_assay_mean %>%
pivot_longer(cols=-grp)
ggplot(mydata, aes(x=name, y=value, color=grp)) +
geom_point()
mydata2 <- aw_assay_mean %>%
pivot_longer(cols=-grp) %>%
pivot_wider(names_from = grp, values_from = value)
ggplot(mydata2, aes(x=trt, y=untrt)) +
geom_point() +
scale_x_log10() +
scale_y_log10()
- t-test, t.test 함수 사용
- 아래 코드는 수정 필요 (기존 ExpressionSet 예제 포함)
grp <- aw_assay3$dex
myttest <- function(x){
z <- t.test(x[grp=="untrt"], x[grp=="trt"])
z$p.value
}
tmp <- t.test(1:10, 21:30)
names(tmp)
testpval <- aw_assay3 %>%
summarise(across(where(is.numeric), myttest))
class 03
- reference genome 생성
library(genbankr)
library(Biostrings)
download.file(url="https://github.com/greendaygh/kribbr2022/raw/main/ecoli-mg1655.gb", destfile="examples/ecoli-mg1655.gb")
# ecoli <- readGenBank("examples/ecoli-mg1655.gb")
mydir <- "examples"
ecoli <- readGenBank(file.path(mydir, "ecoli-mg1655.gb"))
ecoliseq <- getSeq(ecoli)
ecolisub <- subseq(ecoliseq, 1, 10000)
writeXStringSet(ecolisub, file.path(mydir, "ecolisub.fasta"))
- Rsubread 패키지 사용해서 mapping
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("Rsubread")
library(Rsubread)
buildindex(basename = file.path(mydir, "example"),
reference = file.path(mydir, "ecolisub.fasta"))
- mapping
myaln <- align(index = file.path(mydir, "example"),
readfile1 = file.path(mydir, "filtered_SRR11549087_1.fastq_R1.fastq.gz"),
output_file = file.path(mydir, "ecoliexample.BAM"),
nthreads = 5)
myaln
- sorting
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("Rsamtools")
library(Rsamtools)
sortBam(file = file.path(mydir, "ecoliexample.BAM"),
destination = file.path(mydir, "sorted_ecoliexample.BAM"))
indexBam(files = file.path(mydir, "sorted_ecoliexample.BAM.bam"))
genome browser (IGV, UCSC Genome Browser)
IGV 설치
Reference genome, sorted bam/bai file
Counting, GenomicAlignments::summarizeOverlaps
counting을 위해서는 bam 파일과 annotation 파일이 필요
library(GenomicAlignments)
library(plyranges)
## ecoli genbank information
class(ecoli)
methods(class="GenBankRecord")
ecolicds <- cds(ecoli)
class(ecolicds)
ecolicdssub <- ecolicds %>%
plyranges::filter(end < 10000)
seqlengths(ecolicdssub) <- 10000
seqlengths(ecolicdssub)
?summarizeOverlaps
mybam <- BamFile(file.path(mydir, "sorted_ecoliexample.BAM.bam"))
mycnt <- summarizeOverlaps(features = ecolicdssub,
reads = mybam)
class(mycnt)
assay(mycnt)
rowRanges(mycnt)
colData(mycnt)
19.11 Lecture 12 (1020)
Class 01
참고 https://greendaygh.github.io/kribbr2022/lecture-note.html#lecture-07-0804
Reference genome
examples 디렉토리 생성
library(genbankr)
library(Biostrings)
download.file(url="http://github.com/greendaygh/kribbr2022/raw/main/ecoli-mg1655.gb", destfile="examples/ecoli-mg1655.gb")
ecoli <- readGenBank(file.path("examples", "ecoli-mg1655.gb"))
ecoliseq <- getSeq(ecoli)
maxlen <- 1000000
ecoliseqsub <- subseq(x = ecoliseq, start = 1, end = maxlen)
writeXStringSet(x = ecoliseqsub,
filepath = "examples/ex1/ecolisubseq.fasta")
?writeXStringSet
- generation of index
library(Rsubread)
buildindex(basename = file.path("examples", "ex1", "ecolisingle"),
reference = file.path("examples", "ex1", "ecolisubseq.fasta")
)
- NGS fastq 파일 다운로드
- 파일 탐색 오픈
-
D:\Rstudy\lecture12\examples\ex1
이동 - 주소창에서 cmd 입력
- 다음 명령 실행
- SRA DB https://greendaygh.github.io/kribbr2022/high-throughput-data.html#ngs-database
fastq-dump -X 10000 --split-files SRR11549076
- QC
- 생성된 html 파일 확인
library(Rfastp)
myreport <- rfastp(read1 = "examples/ex1/SRR11549076_1.fastq",
outputFastq = "examples/ex1/filterd_SRR11549076_1")
- mapping to reference
library(Rsubread)
myaln <- align(index = file.path("examples", "ex1", "ecolisingle"),
readfile1 = file.path("examples", "ex1", "filterd_SRR11549076_1_R1.fastq.gz"))
- Sorting
library(Rsamtools)
sortBam(file = file.path("examples", "ex1", "filterd_SRR11549076_1_R1.fastq.gz.subread.BAM"),
destination = file.path("examples", "ex1", "sorted_ecolisingle"))
indexBam(file = file.path("examples", "ex1", "sorted_ecolisingle.bam"))
- Counting
library(GenomicAlignments)
library(plyranges)
library(tidyverse)
ecolisub <- genbankr::cds(ecoli) %>%
plyranges::filter(end < maxlen)
class(ecolisub)
seqlengths(ecolisub) <- maxlen
mybam <- BamFile(file = file.path("examples", "ex1", "sorted_ecolisingle.bam"))
class(mybam)
mycnt <- summarizeOverlaps(features = ecolisub,
reads = mybam)
class(mycnt)
assay(mycnt)
colData(mycnt)
rowRanges(mycnt)
metadata(mycnt)
Class 02
- SRA software 설치 https://greendaygh.github.io/kribbr2022/high-throughput-data.html#ngs-database
- ex2 디렉토리 만들고
- 파일탐색기 ex2 주소창에 cmd 입력
- raw data 다운로드
prefetch --option-file SRR_Acc_List.txt
- fastq 파일 생성
Fastq-dump -X 200000 --split-files SRR10009019/SRR10009019.sra
- ex2 디렉토리에서
fastq-dump -X 200000 --split-files SRR10009019
fastq-dump -X 200000 --split-files SRR10009020
fastq-dump -X 200000 --split-files SRR10009021
fastq-dump -X 200000 --split-files SRR10009022
fastq-dump -X 200000 --split-files SRR10009023
fastq-dump -X 200000 --split-files SRR10009024
- QC
library(Rfastp)
for(i in c(19:24)){
fastq_report <- rfastp(
read1 = file.path("examples", "ex2", paste0("SRR100090", i, "_1.fastq")),
read2 = file.path("examples", "ex2", paste0("SRR100090", i, "_2.fastq")),
outputFastq = file.path("examples", "ex2", paste0("filtered_SRR100090", i, ".fastq"))
)
}
- 앞에서 만든 ecolisub 파일 ex2 디렉토리에 붙여넣기
library(genbankr)
library(Biostrings)
download.file(url="https://github.com/greendaygh/kribbr2022/raw/main/ecoli-mg1655.gb", destfile="examples/ecoli-mg1655.gb")
ecoli <- readGenBank("examples/ecoli-mg1655.gb")
ecoliseq <- getSeq(ecoli)
maxlen <- 1000000
ecoliseqsub <- subseq(ecoliseq, 1, maxlen)
writeXStringSet(ecoliseqsub, "examples/ecoli/ecolisub.fasta")
- index 생성
buildindex(basename = file.path("examples", "ex2", "ecolimedia"),
reference = file.path("examples", "ex2", "ecolisubseq.fasta"))
- mapping
library(Rsubread)
for(i in c(19:24)){
alignstat <- align(
file.path("examples", "ex2", "ecolimedia"),
readfile1 = file.path("examples", "ex2", paste0("filtered_SRR100090", i, ".fastq_R1.fastq.gz")),
readfile2 = file.path("examples", "ex2", paste0("filtered_SRR100090", i, ".fastq_R2.fastq.gz")),
output_file = file.path("examples", "ex2", paste0("ecolimedia_", i, ".BAM")),
nthreads = 6)
}
- sorting
library(Rsamtools)
sortBam(file = file.path("examples", "ex2", "ecolimedia_19.BAM")
, destination = file.path("examples", "ex2", "sorted_ecolimedia_19.BAM"))
sortBam(file = file.path("examples", "ex2", "ecolimedia_20.BAM")
, destination = file.path("examples", "ex2", "sorted_ecolimedia_20.BAM"))
sortBam(file = file.path("examples", "ex2", "ecolimedia_21.BAM")
, destination = file.path("examples", "ex2", "sorted_ecolimedia_21.BAM"))
sortBam(file = file.path("examples", "ex2", "ecolimedia_22.BAM")
, destination = file.path("examples", "ex2", "sorted_ecolimedia_22.BAM"))
sortBam(file = file.path("examples", "ex2", "ecolimedia_23.BAM")
, destination = file.path("examples", "ex2", "sorted_ecolimedia_23.BAM"))
sortBam(file = file.path("examples", "ex2", "ecolimedia_24.BAM")
, destination = file.path("examples", "ex2", "sorted_ecolimedia_24.BAM"))
indexBam(files = file.path("examples", "ex2", "sorted_ecolimedia_19.BAM.bam"))
indexBam(files = file.path("examples", "ex2", "sorted_ecolimedia_20.BAM.bam"))
indexBam(files = file.path("examples", "ex2", "sorted_ecolimedia_21.BAM.bam"))
indexBam(files = file.path("examples", "ex2", "sorted_ecolimedia_22.BAM.bam"))
indexBam(files = file.path("examples", "ex2", "sorted_ecolimedia_23.BAM.bam"))
indexBam(files = file.path("examples", "ex2", "sorted_ecolimedia_24.BAM.bam"))
- Counting
ecolicds <- cds(ecoli)
ecolicds_sub <- ecolicds %>%
filter(end < maxlen)
seqlengths(ecolicds_sub) <- maxlen
filelist <- c(file.path("examples", "ex2", "sorted_ecolimedia_19.BAM.bam"),
file.path("examples", "ex2", "sorted_ecolimedia_20.BAM.bam"),
file.path("examples", "ex2", "sorted_ecolimedia_21.BAM.bam"),
file.path("examples", "ex2", "sorted_ecolimedia_22.BAM.bam"),
file.path("examples", "ex2", "sorted_ecolimedia_23.BAM.bam"),
file.path("examples", "ex2", "sorted_ecolimedia_24.BAM.bam")
)
mybam <- BamFileList(filelist)
myresult <- summarizeOverlaps(ecolicds_sub, mybam, ignore.strand = T)
assay(myresult)
rowData(myresult)
colData(myresult)
metadata(myresult)
- DESeq2
- mean vs variance
mydatat <- mydata %>%
data.frame() %>%
rownames_to_column() %>%
pivot_longer(-rowname) %>%
pivot_wider(names_from = rowname, values_from = value)
mymeanval <- mydatat %>%
summarise(across(where(is.numeric), mean))
mysdval <- mydatat %>%
summarise(across(where(is.numeric), sd))
mydf <- mymeanval %>%
bind_rows(mysdval) %>%
as.data.frame
rownames(mydf) <- c("mean", "sd")
mydft <- mydf %>%
rownames_to_column() %>%
pivot_longer(-rowname) %>%
pivot_wider(names_from = rowname, values_from = value)
ggplot(mydft, aes(x=mean, y=sd)) +
geom_point() +
scale_x_log10() +
scale_y_log10()
- DESeq2
library(DESeq2)
metaData <- data.frame(Group = c("LB", "LB", "LB", "M63", "M63", "M63"),
row.names = colnames(mydata))
metaData
#mygenes <- rowData(myresult)
dds <- DESeqDataSetFromMatrix(countData = mydata,
colData = metaData,
design = ~Group,
rowRanges = mygenes
)
colData(myresult)$Group <- metaData$Group
colData(myresult)
dds <- DESeqDataSetFromMatrix(myresult)