This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
tech [2019/02/15 19:54] fplmarques |
tech [2019/03/29 13:34] (current) fplmarques [Extract features from NCBI/GenBank annotation] |
||
---|---|---|---|
Line 6: | Line 6: | ||
</menu> | </menu> | ||
- | ======= Tools for Molecular Sistematics ======= | + | ======= Tools for Molecular Systematics ======= |
------- | ------- | ||
+ | ===== Usefull scripts ===== | ||
+ | ==== Extract features from NCBI/GenBank annotation ==== | ||
+ | |||
+ | The code below extracts nucleotide sequences from *.gb files and outputs the CDS found in fasta format. | ||
+ | |||
+ | <code python | extract_CDS.py> | ||
+ | # | ||
+ | ## Script modified from https://www.biostars.org/p/230441/ | ||
+ | # This script extract CDS nucleotide sequences from mitochondrial refseq genomes | ||
+ | # usage: python extract_CDS.py input_file.gb output_file.fas | ||
+ | # | ||
+ | from Bio import SeqIO | ||
+ | import sys | ||
+ | |||
+ | with open(sys.argv[2], 'w') as nfh: | ||
+ | for rec in SeqIO.parse(sys.argv[1], "genbank"): | ||
+ | if rec.features: | ||
+ | for feature in rec.features: | ||
+ | if feature.type == "CDS": | ||
+ | nfh.write(">%s from %s\n%s\n" % ( | ||
+ | feature.qualifiers['gene'][0], | ||
+ | rec.name, | ||
+ | feature.location.extract(rec).seq)) | ||
+ | |||
+ | </code> | ||
+ | |||
+ | |||
+ | The code below extracts aminoacids sequences of CDS from *.gb files and outputs the CDS found in fasta format. | ||
+ | |||
+ | <code python | extract_CDSaa.py> | ||
+ | # | ||
+ | ## Script modified from https://www.biostars.org/p/230441/ | ||
+ | # This script extract CDS sequences of aminoacids from mitochondrial refseq genomes | ||
+ | # usage: python extract_CDSaa.py input_file.gb output_file.fas | ||
+ | # | ||
+ | from Bio import SeqIO | ||
+ | import sys | ||
+ | |||
+ | with open(sys.argv[2], 'w') as ofh: | ||
+ | for seq_record in SeqIO.parse(sys.argv[1], 'genbank'): | ||
+ | for seq_feature in seq_record.features: | ||
+ | if seq_feature.type=="CDS": | ||
+ | assert len(seq_feature.qualifiers['translation'])==1 | ||
+ | ofh.write(">%s from %s\n%s\n" % ( | ||
+ | seq_feature.qualifiers['gene'][0], | ||
+ | seq_record.name, | ||
+ | seq_feature.qualifiers['translation'][0])) | ||
+ | |||
+ | </code> | ||
+ | |||
+ | The code below extracts tRNA sequences from *.gb files and outputs the tRNA sequences found in fasta format. | ||
+ | |||
+ | <code python | extract_tRNA.py> | ||
+ | # | ||
+ | ## Script modified from https://www.biostars.org/p/230441/ | ||
+ | # This script extract tRNA sequences from mitochondrial refseq genomes | ||
+ | # usage: python extract_tRNA.py input_file.gb output_file.fas | ||
+ | # | ||
+ | from Bio import SeqIO | ||
+ | import sys | ||
+ | |||
+ | with open(sys.argv[2], 'w') as nfh: | ||
+ | for rec in SeqIO.parse(sys.argv[1], "genbank"): | ||
+ | if rec.features: | ||
+ | for feature in rec.features: | ||
+ | if feature.type == "tRNA": | ||
+ | nfh.write(">%s from %s\n%s\n" % ( | ||
+ | feature.qualifiers['product'][0], | ||
+ | rec.name, | ||
+ | feature.location.extract(rec).seq)) | ||
+ | |||
+ | </code> | ||
+ | |||
+ | The code below extracts rRNA sequences from *.gb files and outputs the rRNA sequences found in fasta format. | ||
+ | |||
+ | <code python | extract_rRNA.py> | ||
+ | # | ||
+ | ## Script modified from https://www.biostars.org/p/230441/ | ||
+ | # This script extract rRNA sequences from mitochondrial refseq genomes | ||
+ | # usage: python extract_rDNA.py input_file.gb output_file.fas | ||
+ | # | ||
+ | from Bio import SeqIO | ||
+ | import sys | ||
+ | |||
+ | with open(sys.argv[2], 'w') as nfh: | ||
+ | for rec in SeqIO.parse(sys.argv[1], "genbank"): | ||
+ | if rec.features: | ||
+ | for feature in rec.features: | ||
+ | if feature.type == "rRNA": | ||
+ | nfh.write(">%s from %s\n%s\n" % ( | ||
+ | feature.qualifiers['product'][0], | ||
+ | rec.name, | ||
+ | feature.location.extract(rec).seq)) | ||
+ | |||
+ | </code> | ||
+ | |||
+ | As the script requires individual files, you can automate sequencial runs by using the following script: | ||
+ | |||
+ | <code>for file in *.gb ; do python extract_*.py ${file} ${file%.*}.fas ; done </code> | ||
+ | |||
+ | This code extracts individual genomes from concatenated genomes in GenBank (*.gb) format. | ||
+ | |||
+ | <code perl | get_individual_genomes.pl> | ||
+ | #!/usr/bin/perl | ||
+ | # | ||
+ | # usage: | ||
+ | # perl get_individual_genomes.pl input_file.gb | ||
+ | # | ||
+ | # | ||
+ | $file = $ARGV[0]; | ||
+ | open (FILE, $file); | ||
+ | while (<FILE>) { | ||
+ | if ($_ =~ m/^LOCUS\s+(\w+).*/){ | ||
+ | $output_file = "$1\.gb"; | ||
+ | open($out, '>', $output_file); | ||
+ | print $out "$_"; | ||
+ | } | ||
+ | if (($_ =~ m/^[\w|\s|\d].*/) ne ($_ =~ m/^LOCUS.*/)){ | ||
+ | print $out "$_"; | ||
+ | } | ||
+ | if ($_ =~ m/^\/\/\n/){ | ||
+ | print $out "$_"; | ||
+ | close ($out); | ||
+ | } | ||
+ | } | ||
+ | close (FILE); | ||
+ | </code> | ||
===== Sequence submission to the NCBI/GenBank ===== | ===== Sequence submission to the NCBI/GenBank ===== | ||