DNA Sequence Statistics¶
Reading Data¶
import screed # A Python library for reading FASTA and FASQ file format.
def readFastaFile(inputfile):
"""
Reads and returns file as FASTA format with special characters removed.
"""
with screed.open(inputfile) as seqfile:
for read in seqfile:
seq = read.sequence
return seq
Read FASTA/FASQ File¶
den1 = readFastaFile("../data/den1.fasta")
den2 = readFastaFile("../data/den2.fasta")
den3 = readFastaFile("../data/den3.fasta")
den4 = readFastaFile("../data/den4.fasta")
Statistical Aanalysis¶
Length of a DNA sequence¶
# length of DEN1
len(den1)
10735
# length of DEN2
len(den2)
10723
# length of DEN3
len(den3)
10707
len(den4)
10649
Base composition of a DNA sequence¶¶
def count_base(seq):
base_counts = {}
for base in seq:
if base in base_counts:
base_counts[base] += 1
else:
base_counts[base] = 1
return base_counts
# Let's call the function count_base
counts = count_base(den1)
print(counts)
{'A': 3426, 'G': 2770, 'T': 2299, 'C': 2240}
from collections import Counter
def count_base(seq):
base_counts = Counter(seq)
return base_counts
bases = count_base(den1)
print(bases)
Counter({'A': 3426, 'G': 2770, 'T': 2299, 'C': 2240})
Frequency with Counter¶
from collections import Counter
# Base composition of DEN1
freq1 = Counter(den1)
print(freq1)
Counter({'A': 3426, 'G': 2770, 'T': 2299, 'C': 2240})
# Base composition of DEN2
freq2 = Counter(den2)
print(freq2)
Counter({'A': 3553, 'G': 2713, 'T': 2257, 'C': 2200})
# Base composition of DEN3
freq3 = Counter(den3)
print(freq3)
Counter({'A': 3425, 'G': 2783, 'T': 2281, 'C': 2218})
# Base composition of DEN4
freq4 = Counter(den4)
print(freq2)
Counter({'A': 3553, 'G': 2713, 'T': 2257, 'C': 2200})
GC Content of DNA¶
import screed # A Python library for reading FASTA and FASQ file format.
def readFastaFile(inputfile):
"""
Reads and returns file as FASTA format with special characters removed.
"""
with screed.open(inputfile) as seqfile:
for read in seqfile:
seq = read.sequence
return seq
def calculate_gc_content(seq):
"""
Take DNA sequence as input and calculate the GC content.
"""
no_of_g = seq.count("G")
no_of_c = seq.count("C")
total = no_of_g + no_of_c
gc = total/len(seq) * 100
return gc
# Calculate GC Content of DEN1
den1 = readFastaFile("../data/den1.fasta")
result1 = calculate_gc_content(den1)
print(result1)
46.66977177456917
# Calculate GC Content of DEN2
den2 = readFastaFile("../data/den2.fasta")
result2 = calculate_gc_content(den2)
print(result2)
45.81740184649818
# Calculate GC Content of DEN3
den3 = readFastaFile("../data/den3.fasta")
result3 = calculate_gc_content(den3)
print(result3)
46.707761277668816
# Calculate GC Content of DEN4
den4 = readFastaFile("../data/den4.fasta")
result4 = calculate_gc_content(den4)
print(result4)
47.1217954737534
AT Content of DNA¶
import screed # A Python library for reading FASTA and FASQ file format.
def readFastaFile(inputfile):
"""
Reads and returns file as FASTA format with special characters removed.
"""
with screed.open(inputfile) as seqfile:
for read in seqfile:
seq = read.sequence
return seq
def calculate_at_content(seq):
"""
Take DNA sequence as input and calculate the AT content.
"""
no_of_a = seq.count("A")
no_of_t = seq.count("T")
total = no_of_a + no_of_t
at = total/len(seq) * 100
return at
# Calculate AT Content of DEN1
den1 = readFastaFile("../data/den1.fasta")
result1 = calculate_at_content(den1)
print(result1)
53.33022822543083
# Calculate AT Content of DEN2
den2 = readFastaFile("../data/den2.fasta")
result2 = calculate_at_content(den2)
print(result2)
54.182598153501814
# Calculate AT Content of DEN3
den3 = readFastaFile("../data/den3.fasta")
result3 = calculate_at_content(den3)
print(result3)
53.292238722331184
# Calculate AT Content of DEN4
den4 = readFastaFile("../data/den4.fasta")
result4 = calculate_at_content(den4)
print(result4)
52.8782045262466
A sliding window analysis of GC content¶
import screed # A Python library for reading FASTA and FASQ file format.
def readFastaFile(inputfile):
"""
Reads and returns file as FASTA format with special characters removed.
"""
with screed.open(inputfile) as seqfile:
for read in seqfile:
seq = read.sequence
return seq
def calculate_gc_content(seq):
"""
Take DNA sequence as input and calculate the GC content.
"""
no_of_g = seq.count("G")
no_of_c = seq.count("C")
total = no_of_g + no_of_c
gc = total/len(seq) * 100
return gc
# Calculate the GC content of nucleotides 1-2000 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[1:2001])
46.550000000000004
# Calculate the GC content of nucleotides 2001-4000 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[2001:4001])
45.2
# Calculate the GC content of nucleotides 4001-6000 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[4001:6001])
47.099999999999994
# Calculate the GC content of nucleotides 6001-8000 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[6001:8001])
47.85
# Calculate the GC content of nucleotides 8001-10000 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[8001:10001])
45.45
# Calculate the GC content of nucleotides 10001-10735 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[10001:10735])
50.0
GC content of the non-overlapping sub-sequences of size k¶
def calculate_gc(seq):
"""
Returns percentage of G and C nucleotides in a DNA sequence.
"""
gc = 0
for base in seq:
if base in "GCgc":
gc += 1
else:
gc = 1
return gc/len(seq) * 100
def gc_subseq(seq, k=2000):
"""
Returns GC content of non − overlapping sub− sequences of size k.
The result is a list.
"""
res = []
for i in range(0, len(seq)-k+1, k):
subseq = seq[i:i+k]
gc = calculate_gc(subseq)
res.append(gc)
return gc
seq = readFastaFile("../data/den1.fasta")
gc_subseq(seq, 2000)
0.1
K-mer Analysis¶
def build_kmers(sequence, ksize):
kmers = []
n_kmers = len(sequence) - ksize + 1
for i in range(n_kmers):
kmer = sequence[i:i + ksize]
kmers.append(kmer)
return kmers
seq = readFastaFile("../data/den1.fasta")
# Dimer
km2 = build_kmers(seq, 2)
# Count dimer
from collections import Counter
dimer = Counter(km2)
print(dimer)
Counter({'AA': 1108, 'GA': 976, 'CA': 901, 'AG': 890, 'TG': 832, 'GG': 787, 'AC': 720, 'AT': 708, 'CT': 555, 'TT': 529, 'CC': 523, 'GT': 507, 'GC': 500, 'TC': 497, 'TA': 440, 'CG': 261})
# Trimer
km3 = build_kmers(seq, 3)
# Count trimer
trimer = Counter(km3)
print(trimer)
Counter({'AAA': 388, 'GGA': 354, 'GAA': 346, 'AGA': 334, 'TGG': 321, 'ACA': 294, 'ATG': 292, 'AAG': 280, 'CAA': 273, 'GAG': 249, 'AAC': 237, 'AGG': 237, 'CCA': 236, 'CAG': 227, 'TGA': 218, 'AAT': 203, 'CAT': 203, 'TCA': 202, 'CAC': 198, 'GAC': 197, 'GTG': 192, 'CTG': 186, 'AGC': 184, 'GAT': 184, 'ACC': 176, 'GCA': 169, 'TTG': 162, 'TGT': 160, 'GGG': 160, 'ACT': 156, 'GCT': 144, 'GGT': 141, 'ATA': 141, 'TTT': 139, 'ATT': 138, 'ATC': 137, 'GTT': 136, 'AGT': 135, 'TCT': 134, 'TAG': 133, 'TTC': 133, 'TGC': 133, 'GGC': 132, 'CTA': 131, 'GCC': 129, 'CTC': 121, 'CCT': 121, 'TAT': 118, 'CTT': 116, 'TCC': 112, 'GTC': 106, 'CCC': 106, 'TAA': 101, 'TTA': 95, 'ACG': 94, 'TAC': 88, 'GTA': 73, 'CGT': 71, 'CGA': 70, 'CGG': 69, 'CCG': 60, 'GCG': 58, 'CGC': 51, 'TCG': 49})