from google.colab import drive
import os

drive.mount('/content/drive')
os.chdir('drive/MyDrive/2024-cnu-lecture')
4 Data handling
!pwd
4.0.1 [User]
I want to learn “2. Data Handling and Analysis”. Can you give me a brief explanation and some examples?
4.0.2 [ChatGPT]
Sure! Data handling and analysis are essential skills in Python, especially for tasks like manipulating datasets, performing statistical analyses, and creating visualizations. We’ll focus on the key libraries used for these purposes: NumPy, Pandas, and Matplotlib/Seaborn.
4.0.3 1. NumPy
NumPy is a powerful library for numerical operations and handling arrays.
4.0.3.1 Basics of NumPy
Installation:
!pip install numpy
import numpy as np
# Creating a 1D array
arr1 = np.array([1, 2, 3, 4, 5])

# Creating a 2D array
arr2 = np.array([[1, 2, 3], [4, 5, 6]])

print(arr1)
print(arr2)

# Element-wise operations
arr3 = arr1 * 2
print(arr3)

# Mathematical functions
print(np.sqrt(arr1))
[1 2 3 4 5]
[[1 2 3]
[4 5 6]]
[ 2 4 6 8 10]
[1. 1.41421356 1.73205081 2. 2.23606798]
4.0.3.2 NumPy data structure: ndarray
A Python library for handling matrices and multi-dimensional arrays
Only elements of the same data type are allowed
More than 20 times faster than a Python list
Visualizing a NumPy array differs from the way we usually picture arrays
Usually, when we add dimensions to a drawing, we extend the picture outward as shown below
- In NumPy, however, adding a dimension prepends a new axis to the front of the shape
- When drawn, the axes are interpreted in the order axis 0, 1, 2, as shown below
import numpy as np
display(np.ones(4))
display(np.ones((3, 4)))
display(np.ones((2, 3, 4)))
array([1., 1., 1., 1.])
array([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])
array([[[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]],
[[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]]])
- The representation above is correct, but it differs from the visualization style we are used to, which can cause confusion
- In addition, the tensor dimension convention used by PyTorch below is yet another way of arranging axes
- Rather than memorizing which axis is which, it is therefore important to clearly distinguish depth (channel), width, and height, and to practice visualizing them differently depending on the situation; see the sketch below
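The following is a minimal sketch (added here, not part of the original notes) showing that a newly added dimension goes to the front of the shape:

import numpy as np

a = np.ones((3, 4))           # shape (3, 4): 3 rows, 4 columns
print(a.shape)                # (3, 4)

# A new axis inserted at position 0 goes to the FRONT of the shape
b = a[np.newaxis, ...]        # same as np.expand_dims(a, axis=0)
print(b.shape)                # (1, 3, 4)

# Stacking two (3, 4) arrays also creates the new axis at the front by default
c = np.stack([a, a])
print(c.shape)                # (2, 3, 4) -> axis 0 = depth, axis 1 = rows, axis 2 = columns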
- Creating a NumPy object
import numpy as np
arr = [1, 2, 3]
print(arr)
print(type(arr))

a = np.array([1, 2, 3])
print(a)
print(a.dtype)
print(a.shape)
print(type(a))
[1, 2, 3]
<class 'list'>
[1 2 3]
int64
(3,)
<class 'numpy.ndarray'>
arr2 = np.array([[1, 2, 3], [4, 5, 6]])
print(arr2)
print(type(arr2))
print(arr2.shape)
print(arr2.dtype)
[[1 2 3]
[4 5 6]]
<class 'numpy.ndarray'>
(2, 3)
int64
- NumPy data types (a short dtype sketch follows the next example)
  - Signed integers: int(8, 16, 32, 64)
  - Unsigned integers: uint(8, 16, 32, 64)
  - Floating point: float(16, 32, 64, 128)
  - Complex numbers: complex(64, 128, 256)
  - Boolean: bool
  - Strings: string_
  - Python objects: object
  - Unicode: unicode_
- np.zeros(), np.ones(), np.arange()
- Supports matrix arithmetic
a = np.arange(1, 10).reshape(3, 3)  # values in [1, 10)
print(a)

a = np.ones((3, 4), dtype=np.int16)
b = np.ones((3, 4), dtype=np.int16)
print(a)
print(b)
print(a + b)
print(a - b)
[[1 2 3]
[4 5 6]
[7 8 9]]
[[1 1 1 1]
[1 1 1 1]
[1 1 1 1]]
[[1 1 1 1]
[1 1 1 1]
[1 1 1 1]]
[[2 2 2 2]
[2 2 2 2]
[2 2 2 2]]
[[0 0 0 0]
[0 0 0 0]
[0 0 0 0]]
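A short dtype sketch (added here, not in the original notes), showing explicit dtypes at creation time and conversion with astype():

import numpy as np

a = np.array([1, 2, 3], dtype=np.uint8)
b = np.array([1.5, 2.5], dtype=np.float32)
print(a.dtype, b.dtype)        # uint8 float32

c = b.astype(np.int64)         # conversion truncates toward zero -> [1 2]
print(c, c.dtype)

d = np.array([1, 2.0, 3])      # mixed input is upcast to one common dtype
print(d.dtype)                 # float64 -- an ndarray holds a single dtype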
- NumPy functions (a short demo follows below)
  - np.sqrt()
  - np.log()
  - np.square()
  - np.ceil()
  - np.floor()
  - np.isnan()
  - np.sum()
  - np.mean()
  - np.std()
  - np.min()
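A minimal demo of these functions (added here; the values are chosen only for illustration):

import numpy as np

a = np.array([1.0, 4.0, 9.5, np.nan])
print(np.sqrt(a))                  # element-wise square root
print(np.square(a))                # element-wise square
print(np.ceil(a), np.floor(a))     # round up / round down
print(np.isnan(a))                 # [False False False  True]

b = np.arange(1, 7).reshape(2, 3)
print(np.sum(b), np.mean(b), np.std(b), np.min(b))   # aggregates over all elements
print(np.sum(b, axis=0))                             # column-wise sums -> [5 7 9]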
dir(np)
4.0.4 2. Pandas
Pandas is a powerful library for data manipulation and analysis. It provides data structures like DataFrame, which is similar to tables in databases or spreadsheets.
4.0.4.1 Installation
pip install pandas
- In Pandas, a Series is a data structure for 1-dimensional data and a DataFrame for 2-dimensional data
- DataFrames are the primary data structure in pandas, representing tabular data with rows and columns.
- Composed of an index, columns, and values
- Can be created from lists and dictionaries
- Accepts various data types such as numeric, string, and categorical
from pandas import Series, DataFrame
genes = Series([0.1, 0.2, 1.4, 0.6, 1.1])
print(genes)
0 0.1
1 0.2
2 1.4
3 0.6
4 1.1
dtype: float64
genes = Series([0.1, 0.2, 1.4, 0.6, 1.1], index=['A', 'B', 'C', 'D', 'E'])
print(genes)
A 0.1
B 0.2
C 1.4
D 0.6
E 1.1
dtype: float64
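A brief sketch (added here, not in the original) of label-based access on the Series above:

print(genes['C'])         # 1.4 -- single label
print(genes[['A', 'E']])  # several labels at once
print(genes['B':'D'])     # label slicing includes both endpoints (B, C, D)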
4.0.4.2 Make a DataFrame
- Create from a list
import pandas as pd
df = pd.DataFrame([[0, 4, 5], [0, 6, 7], [20, 30, 40]],
                  index=[1, 2, 3], columns=['P', 'Q', 'R'])
print(df)
P Q R
1 0 4 5
2 0 6 7
3 20 30 40
- Create dataframe from a dictionary
import pandas as pd
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Height': [5.5, 6.0, 5.8]
}
df = pd.DataFrame(data)
print(df)
Name Age Height
0 Alice 25 5.5
1 Bob 30 6.0
2 Charlie 35 5.8
- Create from a list of dictionaries
data = [
    {'Name': 'Alice', 'Age': 25, 'Height': 5.5},
    {'Name': 'Bob', 'Age': 30, 'Height': 6.0},
    {'Name': 'Charlie', 'Age': 35, 'Height': 5.8}
]
df = pd.DataFrame(data)
print(df)
Name Age Height
0 Alice 25 5.5
1 Bob 30 6.0
2 Charlie 35 5.8
- Create from a dictionary of lists (each column becomes a Series)
genes = {'A': [0.5, 0.1, 0.3],
         'B': [0.8, 0.9, 0.4]}
print(genes)

genes_df = DataFrame(genes)
print(genes_df)
print(genes_df['A'])
print(type(genes_df['A']))

genes_df = DataFrame(genes, columns=['B', 'A'], index=['day1', 'day2', 'day3'])
print(genes_df)
{'A': [0.5, 0.1, 0.3], 'B': [0.8, 0.9, 0.4]}
A B
0 0.5 0.8
1 0.1 0.9
2 0.3 0.4
0 0.5
1 0.1
2 0.3
Name: A, dtype: float64
<class 'pandas.core.series.Series'>
B A
day1 0.8 0.5
day2 0.9 0.1
day3 0.4 0.3
4.0.4.3 Dataframe operations
data = [
    {'Name': 'Alice', 'Age': 25, 'Height': 5.5},
    {'Name': 'Bob', 'Age': 30, 'Height': 6.0},
    {'Name': 'Charlie', 'Age': 35, 'Height': 5.8}
]
df = pd.DataFrame(data)
print(df)
Name Age Height
0 Alice 25 5.5
1 Bob 30 6.0
2 Charlie 35 5.8
# Selecting columns
ages = df['Age']
print(ages)

# Filtering rows
adults = df[df['Age'] > 18]
print(adults)

# Adding a new column
df['Weight'] = [65, 70, 75]
print(df)

# Grouping and aggregating
grouped = df.groupby('Name').mean()
print(grouped)
0 25
1 30
2 35
Name: Age, dtype: int64
Name Age Height
0 Alice 25 5.5
1 Bob 30 6.0
2 Charlie 35 5.8
Name Age Height Weight
0 Alice 25 5.5 65
1 Bob 30 6.0 70
2 Charlie 35 5.8 75
Age Height Weight
Name
Alice 25.0 5.5 65.0
Bob 30.0 6.0 70.0
Charlie 35.0 5.8 75.0
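Two more common operations, sketched here as additions (not in the original notes): sorting rows and computing summary statistics on the same DataFrame.

print(df.sort_values('Height', ascending=False))   # tallest first
print(df[['Age', 'Height', 'Weight']].describe())  # count / mean / std / quartiles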
- Automatic index alignment and matrix (element-wise) arithmetic
genes1 = Series([0.1, 0.2, 1.4, 0.6, 1.1], index=['A', 'B', 'C', 'D', 'E'])
genes2 = Series([0.1, 0.2, 1.4, 0.6, 1.1], index=['B', 'C', 'D', 'E', 'A'])
genes1 + genes2
A 1.2
B 0.3
C 1.6
D 2.0
E 1.7
dtype: float64
print(genes2.sort_values())
print(genes2.sort_index())
B 0.1
C 0.2
E 0.6
A 1.1
D 1.4
dtype: float64
A 1.1
B 0.1
C 0.2
D 1.4
E 0.6
dtype: float64
genes = {'A': [0.5, 0.1, 0.3],
         'B': [0.8, 0.9, 0.4]}
genes_df = DataFrame(genes, columns=['B', 'A'], index=['day1', 'day2', 'day3'])
print(genes)
print(genes_df)
{'A': [0.5, 0.1, 0.3], 'B': [0.8, 0.9, 0.4]}
B A
day1 0.8 0.5
day2 0.9 0.1
day3 0.4 0.3
print(genes_df['A'])
print(genes_df.loc['day1'])
print(genes_df.index)
print(list(genes_df.columns))
day1 0.5
day2 0.1
day3 0.3
Name: A, dtype: float64
B 0.8
A 0.5
Name: day1, dtype: float64
Index(['day1', 'day2', 'day3'], dtype='object')
['B', 'A']
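As a complementary sketch (added here, not part of the original notes), positional indexing and boolean filtering on the same DataFrame:

print(genes_df.iloc[0])                 # first row by position (same as .loc['day1'])
print(genes_df.iloc[:, 1])              # second column by position (column 'A')
print(genes_df[genes_df['A'] > 0.2])    # rows where A > 0.2 -> day1 and day3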
4.0.5 3. Matplotlib/Seaborn
Matplotlib is a plotting library for creating static, animated, and interactive visualizations. Seaborn is built on top of Matplotlib and provides a high-level interface for drawing attractive statistical graphics.
Installation:
pip install matplotlib seaborn
import matplotlib.pyplot as plt
# Line plot
plt.plot([1, 2, 3, 4], [1, 4, 9, 16])
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Line Plot')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
data = [
    {'Name': 'Alice', 'Age': 25, 'Height': 5.5},
    {'Name': 'Bob', 'Age': 30, 'Height': 6.0},
    {'Name': 'Charlie', 'Age': 35, 'Height': 5.8}
]
df = pd.DataFrame(data)
print(df)

# Scatter plot with Seaborn
sns.scatterplot(x='Age', y='Height', data=df)
plt.title('Age vs Height')
plt.show()
Name Age Height
0 Alice 25 5.5
1 Bob 30 6.0
2 Charlie 35 5.8
dir(sns)
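An additional sketch (not in the original): plots can be saved to a file instead of, or before, calling plt.show(). The output file name below is hypothetical.

import matplotlib.pyplot as plt

plt.plot([1, 2, 3, 4], [1, 4, 9, 16])
plt.title('Line Plot')
plt.savefig('line_plot.png', dpi=150, bbox_inches='tight')  # hypothetical file name
plt.close()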
4.0.6 Example: Analyzing and Visualizing DNA Sequence Data
4.0.6.1 [User]
I have downloaded a FASTA file containing 800 esterase nucleotide sequences. The file is saved at “data/esterase.fa”. Can you provide Python code that reads the sequences and computes their lengths and GC content ratios?
4.0.6.2 [ChatGPT]
Certainly! To read a FASTA file containing 800 esterase nucleotide sequences and compute their lengths and GC content ratios, you can use the Biopython library. Here is a Python script that accomplishes this:
Installation
First, make sure you have Biopython installed:
pip install biopython
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
# Function to compute sequence length and GC content
def compute_sequence_stats(seq):
    length = len(seq)
    gc_content = gc_fraction(seq) * 100  # Convert to percentage
    return length, gc_content

# Read the FASTA file and process each sequence
def process_fasta_file(file_path):
    records = SeqIO.parse(file_path, "fasta")
    esterase_stats = []

    for record in records:
        length, gc_content = compute_sequence_stats(record.seq)
        esterase_stats.append({"ID": record.id,
                               "Length": length,
                               "GC_Content": gc_content})
    return esterase_stats

# Example usage
fasta_file = 'data/esterases.fasta'  # Replace with your FASTA file path
esterase_stats = process_fasta_file(fasta_file)

# Print the results
for esterase in esterase_stats:
    print(f"ID: {esterase['ID']}, Length: {esterase['Length']} bp, "
          f"GC Content: {esterase['GC_Content']:.2f}%")
4.0.6.3 [User]
I want to store the results above in files in JSON and CSV formats. Then I need code to read the files and convert the data into a DataFrame.
4.0.6.4 [ChatGPT]
Sure! Let’s modify the script to store the results in both JSON and CSV formats. Then, we’ll write a separate script to read these files and convert the data into a Pandas DataFrame.
import json
import csv
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
# Function to compute sequence length and GC content
def compute_sequence_stats(seq):
    length = len(seq)
    gc_content = gc_fraction(seq) * 100  # Convert to percentage
    return length, gc_content

# Read the FASTA file and process each sequence
def process_fasta_file(file_path):
    records = SeqIO.parse(file_path, "fasta")
    esterase_stats = []

    for record in records:
        length, gc_content = compute_sequence_stats(record.seq)
        esterase_stats.append({"ID": record.id,
                               "Length": length,
                               "GC_Content": gc_content})
    return esterase_stats

# Save results to JSON and CSV
def save_results(esterase_stats, json_file, csv_file):
    # Save to JSON
    with open(json_file, 'w') as jf:
        json.dump(esterase_stats, jf, indent=4)

    # Save to CSV
    with open(csv_file, 'w', newline='') as cf:
        writer = csv.DictWriter(cf, fieldnames=['ID', 'Length', 'GC_Content'])
        writer.writeheader()
        for esterase in esterase_stats:
            writer.writerow(esterase)

# Example usage
fasta_file = 'data/esterases.fasta'  # Replace with your FASTA file path
json_file = 'data/esterase_stats.json'
csv_file = 'data/esterase_stats.csv'

esterase_stats = process_fasta_file(fasta_file)
save_results(esterase_stats, json_file, csv_file)
print(f"Results saved to {json_file} and {csv_file}")
Results saved to data/esterase_stats.json and data/esterase_stats.csv
import pandas as pd
import json
# Load data from JSON file
with open('data/esterase_stats.json', 'r') as jf:
    esterase_stats_json = json.load(jf)

# Convert to DataFrame
df_json = pd.DataFrame(esterase_stats_json)
print("DataFrame from JSON file:")
print(df_json.head())

# Load data from CSV file
df_csv = pd.read_csv('data/esterase_stats.csv')
print("DataFrame from CSV file:")
print(df_csv.head())
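As an aside (added here, not part of the original answer), pandas can also write and read these formats directly; the output file name below is hypothetical.

# Write the DataFrame back out and read it again using pandas-native I/O
df_csv.to_json('data/esterase_stats_copy.json', orient='records', indent=4)
df_roundtrip = pd.read_json('data/esterase_stats_copy.json')
print(df_roundtrip.head())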
4.0.6.5 [User]
I want Python code that displays the statistics of the GC content distribution and a boxplot of the sequence lengths.
4.0.6.6 [ChatGPT]
Sure! Let’s create a Python script that reads the data from the previously saved JSON or CSV file, computes statistics for the GC content distribution, and generates a boxplot for the sequence lengths using Pandas and Matplotlib.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load data from CSV file
df = pd.read_csv('data/esterase_stats.csv')

# Compute statistics for GC content distribution
gc_stats = df['GC_Content'].describe()
print("GC Content Distribution Statistics:")
print(gc_stats)

# Generate a histogram for GC content distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['GC_Content'], bins=30, kde=True)
plt.title('GC Content Distribution')
plt.xlabel('GC Content (%)')
plt.ylabel('Frequency')
plt.show()

# Generate a boxplot for sequence lengths
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['Length'])
plt.title('Boxplot of Sequence Lengths')
plt.xlabel('Sequence Length (bp)')
plt.show()
GC Content Distribution Statistics:
count 355.000000
mean 51.362399
std 5.551809
min 29.739777
25% 50.052780
50% 51.901566
75% 54.322222
max 75.683891
Name: GC_Content, dtype: float64
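For completeness (a small addition, not in the original), the same .describe() call can summarize the length distribution; the output depends on your data.

length_stats = df['Length'].describe()
print("Sequence Length Distribution Statistics:")
print(length_stats)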
- Script
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
# Read a FASTA file
fasta_file = 'data/esterases.fasta'
results = SeqIO.parse(fasta_file, 'fasta')

for record in results:
    print(record.id)
    print(record.seq)
    print(gc_fraction(record.seq))

print(results)
- Calculate
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
# Read a FASTA file
fasta_file = 'data/esterases.fasta'
results = SeqIO.parse(fasta_file, 'fasta')

for record in results:
    seq = record.seq
    length = len(seq)
    gc_content = gc_fraction(seq) * 100  # Convert to percentage
    print(f"ID: {record.id}, Length: {length} bp, GC Content: {gc_content:.2f}%")
- Store in a list
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
import pandas as pd
# Read a FASTA file
fasta_file = 'data/esterases.fasta'
results = SeqIO.parse(fasta_file, 'fasta')
esterase_stats = []

for record in results:
    seq = record.seq
    length = len(seq)
    gc_content = gc_fraction(seq) * 100  # Convert to percentage

    esterase_stats.append({"ID": record.id,
                           "Length": length,
                           "GC_Content": gc_content})

print(esterase_stats)
pd.DataFrame(esterase_stats)
- Make a function
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
import pandas as pd
# Read a FASTA file
fasta_file = 'data/esterases.fasta'
results = SeqIO.parse(fasta_file, 'fasta')
esterase_stats = []

# Function to compute sequence length and GC content
def compute_sequence_stats(seq):
    length = len(seq)
    gc_content = gc_fraction(seq) * 100  # Convert to percentage
    return length, gc_content

for record in results:
    seq = record.seq
    length, gc_content = compute_sequence_stats(seq)

    esterase_stats.append({"ID": record.id,
                           "Length": length,
                           "GC_Content": gc_content})

print(esterase_stats)
pd.DataFrame(esterase_stats)