I'll help you process the GSE154881 raw counts file. This appears to be RNA-seq data from GEO database.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data
df = pd.read_csv('GSE154881_raw_counts_GRCh38.p13_NCBI.tsv', sep='\\\\t', index_col=0)
print("Data shape:", df.shape)
print("\\\\nFirst few rows:")
print(df.head())
print("\\\\nColumn names:")
print(df.columns.tolist())
# Remove version numbers from gene IDs if present
df.index = df.index.str.split('.').str[0]
# Check for missing values
print("Missing values:", df.isnull().sum().sum())
# Remove genes with zero counts across all samples
df = df[(df > 0).any(axis=1)]
print("Shape after removing zero-count genes:", df.shape)
# Basic statistics
print("\\\\nSummary statistics:")
print(df.describe())
# Calculate library sizes
library_sizes = df.sum(axis=0)
print("Library sizes:")
print(library_sizes)
# Normalize using Counts Per Million (CPM)
def cpm_normalization(counts):
return (counts / counts.sum(axis=0)) * 1e6
df_cpm = cpm_normalization(df)
# Log2 transformation for downstream analysis
df_log2 = np.log2(df_cpm + 1) # Add pseudocount to avoid log(0)
# Plot library sizes
plt.figure(figsize=(10, 6))
library_sizes.plot(kind='bar')
plt.title('Library Sizes per Sample')
plt.ylabel('Total Counts')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Plot distribution of gene expression
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(df_log2.values.flatten(), bins=50)
plt.title('Distribution of Log2(CPM+1)')
plt.xlabel('Log2(CPM+1)')
plt.subplot(1, 2, 2)
sns.boxplot(data=df_log2.T)
plt.title('Sample Expression Distribution')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Calculate correlation matrix
correlation_matrix = df_log2.corr()
# Plot correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0.9)
plt.title('Sample Correlation Heatmap')
plt.tight_layout()
plt.show()
# PCA for sample clustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_log2.T)
# Perform PCA
pca = PCA()
pca_result = pca.fit_transform(scaled_data)
# Plot PCA
plt.figure(figsize=(10, 8))
plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.7)
for i, sample in enumerate(df_log2.columns):
plt.annotate(sample, (pca_result[i, 0], pca_result[i, 1]))
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
plt.title('PCA of Samples')
plt.grid(True)
plt.show()
# Calculate mean and variance
mean_expression = df_log2.mean(axis=1)
variance_expression = df_log2.var(axis=1)
# Identify top variable genes
top_variable_genes = variance_expression.nlargest(50).index
print("Top 10 most variable genes:")
print(top_variable_genes[:10])
# Plot mean-variance relationship
plt.figure(figsize=(10, 6))
plt.scatter(mean_expression, variance_expression, alpha=0.5)
plt.xlabel('Mean Expression (Log2(CPM+1))')
plt.ylabel('Variance')
plt.title('Mean-Variance Relationship')
plt.show()
# Save processed data
df_log2.to_csv('GSE154881_processed_log2cpm.csv')
df_cpm.to_csv('GSE154881_processed_cpm.csv')
# Save metadata about processing
processing_info = {
'original_shape': df.shape,
'processed_shape': df_log2.shape,
'genes_removed': df.shape[0] - df_log2.shape[0],
'normalization_method': 'CPM + Log2',
'pseudocount': 1
}
import json
with open('processing_metadata.json', 'w') as f:
json.dump(processing_info, f, indent=4)
# This would require sample metadata - adjust based on your experimental design
# Example assuming two groups: control vs treatment
# You'll need to define your sample groups based on the actual experimental design
# groups = {'control': ['sample1', 'sample2'], 'treatment': ['sample3', 'sample4']}
# For actual DE analysis, consider using dedicated packages:
# import DESeq2 (via rpy2) or edgeR for proper statistical analysis
def process_rna_seq_data(file_path):
"""
Complete pipeline for processing RNA-seq count data
"""
# Load data
df = pd.read_csv(file_path, sep='\\\\t', index_col=0)
# Clean data
df.index = df.index.str.split('.').str[0]
df = df[(df > 0).any(axis=1)]
# Normalize
df_cpm = (df / df.sum(axis=0)) * 1e6
df_log2 = np.log2(df_cpm + 1)
return df, df_cpm, df_log2
# Run the complete analysis
raw_df, cpm_df, log2_df = process_rna_seq_data('GSE154881_raw_counts_GRCh38.p13_NCBI.tsv')