import sys import re # patterns pat = re.compile('chr.{1,2}:g\.([0-9]{1,100})[ATCGdi_].') pat_sub = re.compile('(chr.{1,2}):g\.([0-9]{1,100})([ATCG]>[ATCG])') pat_del = re.compile('(chr.{1,2}):g\.([0-9]{1,100})del([ATCG]{1,100})') pat_ins = re.compile('(chr.{1,2}):g\.([0-9]{1,100})_([0-9]{1,100})ins([ATCG]{1,100})') # dictionary : codon table for Homo Sapiens codon_table = { 'GCT':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A', 'CGT':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R', 'AGA':'R', 'AGG':'R', 'TCT':'S', 'TCC':'S', 'TCA':'S', 'TCG':'S', 'AGT':'S', 'AGC':'S', 'ATT':'I', 'ATC':'I', 'ATA':'I', 'TTA':'L', 'TTG':'L', 'CTT':'L', 'CTC':'L', 'CTA':'L', 'CTG':'L', 'GGT':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G', 'GTT':'V', 'GTC':'V', 'GTA':'V', 'GTG':'V', 'ACT':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T', 'CCT':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P', 'AAT':'N', 'AAC':'N', 'GAT':'D', 'GAC':'D', 'TGT':'C', 'TGC':'C', 'CAA':'Q', 'CAG':'Q', 'GAA':'E', 'GAG':'E', 'CAT':'H', 'CAC':'H', 'AAA':'K', 'AAG':'K', 'TTT':'F', 'TTC':'F', 'TAT':'Y', 'TAC':'Y', 'ATG':'M', 'TGG':'W', 'TAG':'-', 'TGA':'-', 'TAA':'-' } # function : translate a nucleotide sequence into amino acid sequence def translate(seq, frame) : pro_seq = '' for i in range(frame, len(seq)-3, 3) : pro_seq += codon_table[seq[i:i+3]] if codon_table[seq[i:i+3]] == '-' : break return pro_seq for frm_length in range(8,15) : in_mut = open(sys.argv[1], 'r') out_fi_name = sys.argv[2] + '_' + str(frm_length) + '.fasta' out_fi = open(out_fi_name, 'w') out_fi_list = [] for line in in_mut : line = line.strip() if line[0] == '>' : ab_line = line ab_list = line[1:].strip().split('|') match_sub = pat_sub.search(ab_list[0]) match_del = pat_del.search(ab_list[0]) match_ins = pat_ins.search(ab_list[0]) print ab_line else : aa_line = translate(line, int(ab_list[-1][0])) print aa_line if aa_line[-1] == '-' : aa_line = aa_line[:-1] if match_del : if ab_list[-1][2] == '-' : nlc_mut_site = int(ab_list[3]) - int(match_del.group(2)) aa_mut_site = (nlc_mut_site - int(ab_list[-1][0])) / 3 if len(aa_line) - aa_mut_site >= frm_length : aa_line = aa_line[:aa_mut_site+frm_length] if aa_mut_site + 1 >= frm_length : aa_line = aa_line[aa_mut_site - frm_length + 1:] print aa_line print len(aa_line) for frames in range(len(aa_line) - frm_length + 1) : new_frame = aa_line[frames:frames+frm_length] out_fi_list.append(ab_line) out_fi_list.append(new_frame) else : nlc_mut_site = int(match_del.group(2)) - int(ab_list[2]) aa_mut_site = (nlc_mut_site - int(ab_list[-1][0])) / 3 if len(aa_line) - aa_mut_site >= frm_length : aa_line = aa_line[:aa_mut_site+frm_length] if aa_mut_site + 1 >= frm_length : aa_line = aa_line[aa_mut_site - frm_length + 1:] print aa_line print len(aa_line) for frames in range(len(aa_line) - frm_length + 1) : new_frame = aa_line[frames:frames+frm_length] out_fi_list.append(ab_line) out_fi_list.append(new_frame) elif match_sub : if ab_list[-1][2] == '-' : nlc_mut_site = int(ab_list[3]) - int(match_sub.group(2)) aa_mut_site = (nlc_mut_site - int(ab_list[-1][0])) / 3 if len(aa_line) - aa_mut_site >= frm_length : aa_line = aa_line[:aa_mut_site+frm_length] if aa_mut_site + 1 >= frm_length : aa_line = aa_line[aa_mut_site - frm_length + 1:] print aa_line print len(aa_line) for frames in range(len(aa_line) - frm_length + 1) : new_frame = aa_line[frames:frames+frm_length] out_fi_list.append(ab_line) out_fi_list.append(new_frame) else : nlc_mut_site = int(match_sub.group(2)) - int(ab_list[2]) aa_mut_site = (nlc_mut_site - int(ab_list[-1][0])) / 3 if len(aa_line) - aa_mut_site >= frm_length : aa_line = aa_line[:aa_mut_site+frm_length] if aa_mut_site + 1 >= frm_length : aa_line = aa_line[aa_mut_site - frm_length + 1:] print aa_line print len(aa_line) for frames in range(len(aa_line) - frm_length + 1) : new_frame = aa_line[frames:frames+frm_length] out_fi_list.append(ab_line) out_fi_list.append(new_frame) elif match_ins : if ab_list[-1][2] == '-' : nlc_mut_site = int(ab_list[3]) - int(match_ins.group(3)) aa_mut_site = (nlc_mut_site - int(ab_list[-1][0])) / 3 if len(aa_line) - aa_mut_site >= frm_length : aa_line = aa_line[:aa_mut_site+frm_length] if aa_mut_site + 1 >= frm_length : aa_line = aa_line[aa_mut_site - frm_length + 1:] print aa_line print len(aa_line) for frames in range(len(aa_line) - frm_length + 1) : new_frame = aa_line[frames:frames+frm_length] out_fi_list.append(ab_line) out_fi_list.append(new_frame) else : nlc_mut_site = int(match_ins.group(3)) - int(ab_list[2]) aa_mut_site = (nlc_mut_site - int(ab_list[-1][0])) / 3 if len(aa_line) - aa_mut_site >= frm_length : aa_line = aa_line[:aa_mut_site+frm_length] if aa_mut_site + 1 >= frm_length : aa_line = aa_line[aa_mut_site - frm_length + 1:] print aa_line print len(aa_line) for frames in range(len(aa_line) - frm_length + 1) : new_frame = aa_line[frames:frames+frm_length] out_fi_list.append(ab_line) out_fi_list.append(new_frame) out_fi.write('\n'.join(out_fi_list)+'\n') out_fi.close() in_mut.close()