!pwd
3 Python Basics
3.1 구글 드라이브 연결
- 세션이 다시 시작 될 때에도 실행 필요
from google.colab import drive
import os
'/content/drive')
drive.mount('drive/MyDrive/2024-cnu-lecture') os.chdir(
!pwd
3.2 [User]
Okay, I’ll learn “1. Basic Python Programming”. Can you teach me with a brief explanation and examples
3.3 [ChatGPT]
Absolutely! Let’s start with the basics of Python programming. I’ll cover fundamental concepts with brief explanations and examples to get you started.
3.4 Python Syntax and Semantics
3.4.1 Hello World
The simplest Python program:
print("Hello, World!")
3.5 Variables and Data Types
Python supports various data types including integers, floats, strings, and booleans.
# Integer
= 10
a
# Float
= 3.14
b
# String
= "Hello, Python"
c
# Boolean
= True
d
print(a, b, c, d)
10 3.14 Hello, Python True
3.6 Control Structures
3.6.1 If-Else Statements
Used for decision making.
= 10
x if x > 5:
print("x is greater than 5")
else:
print("x is 5 or less")
x is greater than 5
3.6.2 Loops
For repeating tasks.
for i in range(5):
print(i)
0
1
2
3
4
= 0
i while i < 5:
print(i)
+= 1 i
0
1
2
3
4
3.6.3 Functions
3.6.3.1 Defining Functions
Reusable blocks of code.
def greet(name):
return f"Hello, {name}"
print(greet("Alice"))
Hello, Alice
= ["Alice", "Bob", "Charlie"]
names
for name in names:
print ("My name is ", name)
My name is Alice
My name is Bob
My name is Charlie
- 리스트 값 평균 리턴하는 함수
def average(input):
if len(input) == 0:
return None
return sum(input) / len(input)
= [1,2,3,4,5,6,7,8,9,10]
x print(average(x))
5.5
- 함수에 대한 개념 이해 중요
- 실습과 예제를 통해 이해 필요 (마지막 예제에서 실습)
3.6.3.2 f-string
- f와
{}
조합으로 표현할 수 있으며 출력이 고정된 문자열 맨 앞에 f를 붙여주고, 출력하고 싶은 변수값 등은 중괄호로 표현
= "Alice"
name = f"My name is {name}"
mystr
print(mystr)
My name is Alice
= ["Alice", "Alisa", "Alister"]
names = f"My name is {names}"
mystr
print(mystr)
My name is ['Alice', 'Alisa', 'Alister']
= ["Alice", "Alisa", "Alister"]
names
for name in names:
= f"My name is {name}"
mystr print(mystr)
My name is Alice
My name is Alisa
My name is Alister
3.7 Lists and Dictionaries
- 리스트나 딕셔너리는 파이썬에서 데이터를 저장하는 변수의 자료형임
- 여러 종류의 데이터를 효율적으로 활용하기 위한 자료 구조임
3.7.1 Lists (리스트)
- Ordered, mutable collections.
= ["apple", "banana", "cherry"]
fruits print(fruits[0]) # Accessing elements
"date") # Adding an element
fruits.append(print(fruits)
apple
['apple', 'banana', 'cherry', 'date']
- 인덱싱은 값 자체 (1은 두 번째값)
- 슬라이싱은 값 사이 경계선 (1은 첫 번째 값과 두 번째 값 사이)
- 아래 그림과 여러 실습 예제를 통한 이해 필요
= [x for x in range(10)] # 리스트 컴프리헨션
geneids print(geneids)
print(geneids[0])
print(geneids[-1])
print(geneids[2:5])
print(geneids[2:-3])
print(geneids[:])
print(geneids[:-1])
print(geneids[1:])
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0
9
[2, 3, 4]
[2, 3, 4, 5, 6]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[0, 1, 2, 3, 4, 5, 6, 7, 8]
[1, 2, 3, 4, 5, 6, 7, 8, 9]
- 리스트 데이터 삽입 삭제
= [1, 2, 3]
geneids print(geneids)
4)
geneids.append(print(geneids)
print("length: %d" % len(geneids))
len(geneids):] = [5]
geneids[print(geneids)
print(geneids.pop())
print(geneids)
[1, 2, 3]
[1, 2, 3, 4]
length: 4
[1, 2, 3, 4, 5]
5
[1, 2, 3, 4]
3.7.2 Tuple (튜플)
- 리스트와 같은 기능이지만 ‘(’, ’)’를 사용하고 원소를 변경할 수 없음
- 리스트보다 빠른 속도, 리스트와 동일한 인덱싱 방법
= (1, 2, 3)
geneids print(geneids[0:2])
#geneids[0] = 4 ## error
(1, 2)
- 반복문에서 리스트 또는 튜플 활용
= ['123', '456', '789']
geneids for geneid in geneids:
print(f"geneid: {geneid}")
geneid: 123
geneid: 456
geneid: 789
3.7.3 Dictionaries (딕셔너리)
- Key-value pairs, unordered.
= {"name": "Alice", "age": 25}
person print(person["name"])
"age"] = 26 # Updating value
person[print(person)
Alice
{'name': 'Alice', 'age': 26}
- 키(key)와 값(value)을 쌍으로 저장, ‘{’와’}’를 사용
= {}
gene_expr 'A'] = 0.5
gene_expr[print(gene_expr)
'B'] = 1.2
gene_expr[print(gene_expr)
print(len(gene_expr))
{'A': 0.5}
{'A': 0.5, 'B': 1.2}
2
- 인덱싱은 ‘[’, ’]’ 사용, 키 값으로 인덱싱, 정수값 인덱싱 불가
print(gene_expr['A'])
## gene_expr[0] # error
0.5
- 데이터 추가는 key값 value값으로 수행, 삭제는 del 함수 이용
'C'] = 0.3
gene_expr[print(gene_expr)
del gene_expr['C']
print(gene_expr)
{'A': 0.5, 'B': 1.2, 'C': 0.3}
{'A': 0.5, 'B': 1.2}
- key 값과 value 값 구하기
= list(gene_expr.keys())
gene_expr_keys print("keys:", gene_expr_keys)
= list(gene_expr.values())
gene_expr_values print("values:", gene_expr_values)
keys: ['A', 'B']
values: [0.5, 1.2]
- in 활용 키 값 탐색
print('D' in gene_expr_keys)
print('D' in gene_expr)
print('A' in gene_expr)
False
False
True
- 반복문에서 딕셔너리 활용 items()
= {'A':0.5, 'B':1.2, 'C':0.3, 'D':3.2}
gene_expr for key, val in gene_expr.items():
print(f"{key} expression value is {val}")
#print("%s expression value is %s" %(geneid, expval))
A expression value is 0.5
B expression value is 1.2
C expression value is 0.3
D expression value is 3.2
3.8 Modules and Packages
3.8.1 Importing Modules
Using standard libraries.
import math
print(math.sqrt(16))
4.0
3.8.2 Importing Specific Functions
from math import sqrt
print(sqrt(25))
5.0
- 위 average 함수를 mystat.py 라는 이름의 파일로 저장, 모듈로 활용
#import mystat
#x = list(range(10))
#print(mystat.average(x))
- 모듈 직접 실행시 모듈 내 test 코드 실행 (name == main, True)
#%run mystat
- 모듈 임포트
import os
os.getcwd()
'/home/haseong/lecture/cnu-deeplearning-2024'
from os import getcwd
getcwd()
'/home/haseong/lecture/cnu-deeplearning-2024'
3.9 File I/O
3.9.1 Writing to a File
with open('example.txt', 'w') as file:
file.write("Hello, World!")
3.9.2 Reading from a File
with open('example.txt', 'r') as file:
= file.read()
content print(content)
3.9.3 Example: Basic DNA Sequence Manipulation
Let’s put some of these concepts together with a simple example that manipulates a DNA sequence.
# Define a DNA sequence
= "ATGCGTACGTAGCTAGCTAG"
dna_seq
# Function to compute GC content
def gc_content(seq):
= seq.count('G') + seq.count('C')
gc_count return gc_count / len(seq) * 100
# Function to get the reverse complement of the sequence
def reverse_complement(seq):
= {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
complement return ''.join(complement[base] for base in reversed(seq))
# Compute GC content
= gc_content(dna_seq)
gc print(f"GC Content: {gc:.2f}%")
# Get reverse complement
= reverse_complement(dna_seq)
rev_comp print(f"Reverse Complement: {rev_comp}")
GC Content: 50.00%
Reverse Complement: CTAGCTAGCTACGTACGCAT
3.9.4 String join (문자열 붙이기)
= "ATGC"
seq1 = "TAGC"
seq2 = seq1 + seq2
dna_seq
for base in dna_seq:
print(base)
print(type(dna_seq))
A
T
G
C
T
A
G
C
<class 'str'>
= {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
complement
for base in dna_seq:
print(complement [base])
= []
dna_seq_list for base in dna_seq:
dna_seq_list.append(complement[base])
print(dna_seq_list)
print(type(dna_seq_list))
= "".join (dna_seq_list)
dna_seq_comp print(dna_seq_comp)
print(type(dna_seq_comp))
T
A
C
G
A
T
C
G
['T', 'A', 'C', 'G', 'A', 'T', 'C', 'G']
<class 'list'>
TACGATCG
<class 'str'>
= [complement[base] for base in dna_seq]
dna_seq_list print(dna_seq_list)
= "".join(dna_seq_list)
dna_seq_comp print(dna_seq_comp)
print("".join([complement[base] for base in dna_seq]))
print("".join([complement[base] for base in dna_seq if base == "A" or base == "T"]))
['T', 'A', 'C', 'G', 'A', 'T', 'C', 'G']
TACGATCG
TACGATCG
TAAT
3.10 클래스
- Gene, Strain class 생성 연습
- Gene attribute: name, chromosomal location, length
- Strain attribute (변수): name, length of chromosome
- Strain method (함수): compute average length of the genes
import statistics
class ORF:
def __init__(self, location, length, seq):
self.location = location
self.length = length
self.sequence = seq
class Strain:
def __init__(self, name, chrlength):
self.name = name
self.chr_length = chrlength
self.orfs = []
def add_orf(self, location, length, seq):
self.orfs.append(ORF(location, length, seq))
def orf_length_average(self):
return statistics.mean([s.length for s in self.orfs])
= Strain("ecoli", 5000000)
ecoli 1, 1000, "ATG")
ecoli.add_orf(1001, 2000, "CCT")
ecoli.add_orf(2001, 3000, "ATC") ecoli.add_orf(
print([g.location for g in ecoli.orfs])
print([g.sequence for g in ecoli.orfs])
ecoli.orf_length_average()
[1, 1001, 2001]
['ATG', 'CCT', 'ATC']
2000
- 상속
class Gene(ORF):
def add_protein(self, prot_name, prot_seq):
self.prot_name = prot_name
self.prot_sequence = prot_seq
= Gene(1, 1000, "ATG")
gene1 print(gene1.location)
"myprotein", "M")
gene1.add_protein(print(gene1.prot_name)
1
myprotein
3.11 Example: Synonymous Mutant Library Generation
3.11.1 [User]
I want to generate mutant library of a protein. I have a nucleotide genbank file for the target protein. I wan to generate all the mutants by changing synonymous codons for each residue. can you give me the python code?3.11.2 [ChatGPT]
Certainly! To generate a mutant library by changing synonymous codons for each residue of a protein, we’ll need to:
- Read the nucleotide sequence from a GenBank file.
- Translate the nucleotide sequence to a protein sequence.
- Identify synonymous codons for each amino acid.
- Generate all possible synonymous mutants for the target protein.
Here is the Python code that accomplishes these tasks:
- 이 예제는 위 프롬프트로 ChatGPT가 만든 코드입니다
3.11.2.1 Install Necessary Libraries
Make sure you have Biopython installed:
!pip install biopython
3.11.2.2 Python Script to Generate Synonymous Mutant Library
from Bio import SeqIO
from Bio.Seq import Seq
from itertools import product
# Synonymous codon table
= {
synonymous_codons 'F': ['TTT', 'TTC'],
'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
'I': ['ATT', 'ATC', 'ATA'],
'M': ['ATG'],
'V': ['GTT', 'GTC', 'GTA', 'GTG'],
'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
'P': ['CCT', 'CCC', 'CCA', 'CCG'],
'T': ['ACT', 'ACC', 'ACA', 'ACG'],
'A': ['GCT', 'GCC', 'GCA', 'GCG'],
'Y': ['TAT', 'TAC'],
'H': ['CAT', 'CAC'],
'Q': ['CAA', 'CAG'],
'N': ['AAT', 'AAC'],
'K': ['AAA', 'AAG'],
'D': ['GAT', 'GAC'],
'E': ['GAA', 'GAG'],
'C': ['TGT', 'TGC'],
'W': ['TGG'],
'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
'G': ['GGT', 'GGC', 'GGA', 'GGG'],
'*': ['TAA', 'TAG', 'TGA']
}
# Function to read the GenBank file and extract the coding sequence
def read_cds_from_genbank(file_path):
= SeqIO.read(file_path, "genbank")
record for feature in record.features:
if feature.type == "CDS":
return feature.location.extract(record).seq
raise ValueError("No CDS found in the GenBank file.")
# Function to generate synonymous mutants
def generate_synonymous_mutants(cds_seq):
= cds_seq.translate()
protein_seq = []
codon_options
for aa in protein_seq[:5]:
= synonymous_codons.get(aa, [])
codons if not codons:
raise ValueError(f"Unknown amino acid {aa} encountered.")
codon_options.append(codons)
# Generate all combinations of synonymous codons
= list(product(*codon_options))
all_mutants return [''.join(mutant) for mutant in all_mutants]
# Main function
def main(genbank_file, output_file):
= read_cds_from_genbank(genbank_file)
cds_seq = generate_synonymous_mutants(cds_seq)
mutants
# Write the mutants to a FASTA file
with open(output_file, 'w') as f:
for i, mutant in enumerate(mutants):
f">Mutant_{i+1}\n{mutant}\n")
f.write(
# Example usage
= 'data/egfp.gb' # Replace with your GenBank file path
genbank_file = 'mutant_library.fasta'
output_file main(genbank_file, output_file)
- 위 코드를 두 개의 파일로 나누어 (함수모음 파일, 실행 파일) 모듈 실습 가능