DNA Sequence Statistics

Reading Data

import screed # A Python library for reading FASTA and FASQ file format.
def readFastaFile(inputfile):
    """
    Reads and returns file as FASTA format with special characters removed.
    """
    with screed.open(inputfile) as seqfile:
        for read in seqfile:
            seq = read.sequence
    return seq

Read FASTA/FASQ File

den1 = readFastaFile("../data/den1.fasta")
den2 = readFastaFile("../data/den2.fasta")
den3 = readFastaFile("../data/den3.fasta")
den4 = readFastaFile("../data/den4.fasta")

Statistical Aanalysis

Length of a DNA sequence

# length of DEN1 
len(den1)
10735
# length of DEN2
len(den2)
10723
# length of DEN3
len(den3)
10707
len(den4)
10649

Base composition of a DNA sequence¶

def count_base(seq): 
    base_counts = {} 
    for base in seq: 
        if base in base_counts: 
            base_counts[base] += 1 
        else: 
            base_counts[base] = 1 
    return base_counts
# Let's call the function count_base
counts = count_base(den1)
print(counts)
{'A': 3426, 'G': 2770, 'T': 2299, 'C': 2240}
from collections import Counter
def count_base(seq): 
    base_counts = Counter(seq)
    return base_counts
bases = count_base(den1)
print(bases)
Counter({'A': 3426, 'G': 2770, 'T': 2299, 'C': 2240})

Frequency with Counter

from collections import Counter
# Base composition of DEN1
freq1 = Counter(den1)
print(freq1)
Counter({'A': 3426, 'G': 2770, 'T': 2299, 'C': 2240})
# Base composition of DEN2
freq2 = Counter(den2)
print(freq2)
Counter({'A': 3553, 'G': 2713, 'T': 2257, 'C': 2200})
# Base composition of DEN3
freq3 = Counter(den3)
print(freq3)
Counter({'A': 3425, 'G': 2783, 'T': 2281, 'C': 2218})
# Base composition of DEN4
freq4 = Counter(den4)
print(freq2)
Counter({'A': 3553, 'G': 2713, 'T': 2257, 'C': 2200})

GC Content of DNA

import screed # A Python library for reading FASTA and FASQ file format.
def readFastaFile(inputfile):
    """
    Reads and returns file as FASTA format with special characters removed.
    """
    with screed.open(inputfile) as seqfile:
        for read in seqfile:
            seq = read.sequence
    return seq

def calculate_gc_content(seq): 
    """
    Take DNA sequence as input and calculate the GC content.
    """
    no_of_g = seq.count("G")
    no_of_c = seq.count("C")
    total = no_of_g + no_of_c 
    gc = total/len(seq) * 100
    
    return gc 
# Calculate GC Content of DEN1
den1 = readFastaFile("../data/den1.fasta")
result1 = calculate_gc_content(den1)
print(result1) 
46.66977177456917
# Calculate GC Content of DEN2
den2 = readFastaFile("../data/den2.fasta")
result2 = calculate_gc_content(den2)
print(result2)
45.81740184649818
# Calculate GC Content of DEN3
den3 = readFastaFile("../data/den3.fasta")
result3 = calculate_gc_content(den3)
print(result3)
46.707761277668816
# Calculate GC Content of DEN4
den4 = readFastaFile("../data/den4.fasta")
result4 = calculate_gc_content(den4)
print(result4)
47.1217954737534

AT Content of DNA

import screed # A Python library for reading FASTA and FASQ file format.
def readFastaFile(inputfile):
    """
    Reads and returns file as FASTA format with special characters removed.
    """
    with screed.open(inputfile) as seqfile:
        for read in seqfile:
            seq = read.sequence
    return seq

def calculate_at_content(seq): 
    """
    Take DNA sequence as input and calculate the AT content.
    """
    no_of_a = seq.count("A")
    no_of_t = seq.count("T")
    total = no_of_a + no_of_t
    at = total/len(seq) * 100
    
    return at 
# Calculate AT Content of DEN1
den1 = readFastaFile("../data/den1.fasta")
result1 = calculate_at_content(den1)
print(result1)
53.33022822543083
# Calculate AT Content of DEN2
den2 = readFastaFile("../data/den2.fasta")
result2 = calculate_at_content(den2)
print(result2)
54.182598153501814
# Calculate AT Content of DEN3
den3 = readFastaFile("../data/den3.fasta")
result3 = calculate_at_content(den3)
print(result3)
53.292238722331184
# Calculate AT Content of DEN4
den4 = readFastaFile("../data/den4.fasta")
result4 = calculate_at_content(den4)
print(result4)
52.8782045262466

A sliding window analysis of GC content

import screed # A Python library for reading FASTA and FASQ file format.
def readFastaFile(inputfile):
    """
    Reads and returns file as FASTA format with special characters removed.
    """
    with screed.open(inputfile) as seqfile:
        for read in seqfile:
            seq = read.sequence
    return seq

def calculate_gc_content(seq): 
    """
    Take DNA sequence as input and calculate the GC content.
    """
    no_of_g = seq.count("G")
    no_of_c = seq.count("C")
    total = no_of_g + no_of_c 
    gc = total/len(seq) * 100
    
    return gc 
# Calculate the GC content of nucleotides 1-2000 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[1:2001])
46.550000000000004
# Calculate the GC content of nucleotides 2001-4000 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[2001:4001])
45.2
# Calculate the GC content of nucleotides 4001-6000 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[4001:6001])
47.099999999999994
# Calculate the GC content of nucleotides 6001-8000 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[6001:8001])
47.85
# Calculate the GC content of nucleotides 8001-10000 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[8001:10001])
45.45
# Calculate the GC content of nucleotides 10001-10735 of the Dengue genome(DEN-1)
seq = readFastaFile("../data/den1.fasta")
calculate_gc_content(seq[10001:10735])
50.0

GC content of the non-overlapping sub-sequences of size k

def calculate_gc(seq):
    """ 
    Returns percentage of G and C nucleotides in a DNA sequence.
    """
    gc = 0 
    for base in seq:
        if base in "GCgc":
            gc += 1 
        else:
            gc = 1 
    return gc/len(seq) * 100

def gc_subseq(seq, k=2000):
    """
    Returns GC content of non − overlapping sub− sequences of size k.
    The result is a list.
    """
    res = [] 
    for i in range(0, len(seq)-k+1, k):
        subseq = seq[i:i+k]
        gc = calculate_gc(subseq)
        res.append(gc)
    return gc 
    
seq = readFastaFile("../data/den1.fasta")
gc_subseq(seq, 2000)
0.1

K-mer Analysis

def build_kmers(sequence, ksize):
    kmers = []
    n_kmers = len(sequence) - ksize + 1

    for i in range(n_kmers):
        kmer = sequence[i:i + ksize]
        kmers.append(kmer)

    return kmers
seq = readFastaFile("../data/den1.fasta")
# Dimer
km2 = build_kmers(seq, 2)
# Count dimer
from collections import Counter
dimer = Counter(km2)
print(dimer)
Counter({'AA': 1108, 'GA': 976, 'CA': 901, 'AG': 890, 'TG': 832, 'GG': 787, 'AC': 720, 'AT': 708, 'CT': 555, 'TT': 529, 'CC': 523, 'GT': 507, 'GC': 500, 'TC': 497, 'TA': 440, 'CG': 261})
# Trimer 
km3 = build_kmers(seq, 3)
# Count trimer
trimer = Counter(km3)
print(trimer)
Counter({'AAA': 388, 'GGA': 354, 'GAA': 346, 'AGA': 334, 'TGG': 321, 'ACA': 294, 'ATG': 292, 'AAG': 280, 'CAA': 273, 'GAG': 249, 'AAC': 237, 'AGG': 237, 'CCA': 236, 'CAG': 227, 'TGA': 218, 'AAT': 203, 'CAT': 203, 'TCA': 202, 'CAC': 198, 'GAC': 197, 'GTG': 192, 'CTG': 186, 'AGC': 184, 'GAT': 184, 'ACC': 176, 'GCA': 169, 'TTG': 162, 'TGT': 160, 'GGG': 160, 'ACT': 156, 'GCT': 144, 'GGT': 141, 'ATA': 141, 'TTT': 139, 'ATT': 138, 'ATC': 137, 'GTT': 136, 'AGT': 135, 'TCT': 134, 'TAG': 133, 'TTC': 133, 'TGC': 133, 'GGC': 132, 'CTA': 131, 'GCC': 129, 'CTC': 121, 'CCT': 121, 'TAT': 118, 'CTT': 116, 'TCC': 112, 'GTC': 106, 'CCC': 106, 'TAA': 101, 'TTA': 95, 'ACG': 94, 'TAC': 88, 'GTA': 73, 'CGT': 71, 'CGA': 70, 'CGG': 69, 'CCG': 60, 'GCG': 58, 'CGC': 51, 'TCG': 49})