Handling FASTA & FASTQ with the Screed Library
import screed
# Iterate over every record in the FASTA file with screed. `seq` and `name`
# are rebound on each pass, so afterwards they hold the last record's values.
# NOTE(review): indentation was lost in this export; print(name) is assumed to
# sit inside the loop — confirm against the original notebook.
with screed.open("../data/Haemophilus_influenzae.fasta") as seqfile:
    for read in seqfile:
        seq = read.sequence
        name = read.name
        print(name)
CP005967.1 Haemophilus influenzae KR494, complete genome
# Preview the first 500 characters of the sequence read above.
seq[:500]
'AACCGAAATTACAGTGCATGGACGCACAAAATCTGATGGTTATCGTGCTGATAGAATTAATTGGAAAAAAATTGGTAAAGTCCGAGAGCGTTTATCCATTCCTGTTATTGCTAACGGAGAAATTTGGCATTGGCAAGATGGTCAAGATTGCTTATCTCAAACAGGTTGTCAGGATTTAATGGTGGGACGAGGTGCATTGAATATTCCGAACTTAAGCCATGTTCTGAAATCAAATGCAGAAAAAATGCCTTGGAATGAGATTCAAAAAATCTTGCAAAAATATGCGAATGTTGAAAATGAATATGGCAGCGGTTTTTACCATGTGGCACGAATTAAACAATGGTTACGTTATTTGAATAAGGAATATGATGAGGCGAACCAAGAGTTTGATAAGATTAAGACTTGCCAAACTGCTGAAGATTTGAAATTACGGTTAAATGATAAATAAAAAACCTGCTAATCAGCAGGTTTTCTTTTTCTAAATTATTTAAAAATTCACC'
# Total length of the sequence read above.
len(seq)
1856176
Template for Handling FASTA and FASTQ with Screed
import screed  # A Python library for reading FASTA and FASTQ file formats.
def readFastaFile(inputfile):
    """
    Return the sequence of the last record in a FASTA file.

    The file is opened with ``screed.open`` and every record is visited, but
    only the final record's ``sequence`` is kept. For a single-record file
    (for example one complete genome) that is simply the record's sequence.
    The sequence is returned exactly as screed provides it; no characters
    are stripped or altered here.

    Args:
        inputfile: Path of the sequence file, passed straight to
            ``screed.open``.

    Returns:
        The ``sequence`` attribute of the last record in the file.

    Raises:
        ValueError: If iterating the file yields no records.
    """
    # Sentinel so an empty file is reported clearly instead of failing with
    # an UnboundLocalError at the return statement.
    seq = None
    with screed.open(inputfile) as seqfile:
        for read in seqfile:
            # Rebound on every record: only the last one survives the loop.
            seq = read.sequence
    if seq is None:
        raise ValueError(f"no sequence records found in {inputfile!r}")
    return seq
# Read the Haemophilus influenzae FASTA file with readFastaFile and preview
# the first 200 characters of the returned sequence.
seqs = readFastaFile("../data/Haemophilus_influenzae.fasta")
seqs[:200]
'AACCGAAATTACAGTGCATGGACGCACAAAATCTGATGGTTATCGTGCTGATAGAATTAATTGGAAAAAAATTGGTAAAGTCCGAGAGCGTTTATCCATTCCTGTTATTGCTAACGGAGAAATTTGGCATTGGCAAGATGGTCAAGATTGCTTATCTCAAACAGGTTGTCAGGATTTAATGGTGGGACGAGGTGCATTGA'
import screed  # A Python library for reading FASTA and FASTQ file formats.
def readFastqFile(inputfile):
    """
    Return the sequence of the last read in a FASTQ file.

    The file is opened with ``screed.open`` and every record is visited, but
    only the final record's ``sequence`` is kept; earlier reads are
    discarded. The sequence is returned exactly as screed provides it; no
    characters are stripped or altered here.

    Args:
        inputfile: Path of the sequence file, passed straight to
            ``screed.open``.

    Returns:
        The ``sequence`` attribute of the last record in the file.

    Raises:
        ValueError: If iterating the file yields no records.
    """
    # Sentinel so an empty file is reported clearly instead of failing with
    # an UnboundLocalError at the return statement.
    seq = None
    with screed.open(inputfile) as seqfile:
        for read in seqfile:
            # Rebound on every record: only the last one survives the loop.
            seq = read.sequence
    if seq is None:
        raise ValueError(f"no sequence records found in {inputfile!r}")
    return seq
# Read the sample FASTQ file with readFastqFile, which returns a single
# sequence string, and display it.
seqs = readFastqFile("../data/SRR835775_1.first1000.fastq")
seqs
'GTTGGCTGCCCCTGCAGGTCCCTGTCACCTCTCACATGTCCCTGCCTAATCTTGCAGGTCCCAGAGAACTACTTCTATGTGCCAGACCTGGGCCAGGTGC'