import re import sys pat_sub = re.compile('(chr.{1,2}):g\.([0-9]{1,100})([ATCG]>[ATCG])') pat_del = re.compile('(chr.{1,2}):g\.([0-9]{1,100})del([ATCG]{1,100})') pat_ins = re.compile('(chr.{1,2}):g\.([0-9]{1,100})_([0-9]{1,100})ins([ATCG]{1,100})') # a function reversing the sequence def reverse(seq_in) : seq_out = '' for base_in in seq_in : if base_in == 'A' : base_out = 'T' elif base_in == 'T' : base_out = 'A' elif base_in == 'C' : base_out = 'G' else : base_out = 'C' seq_out += base_out return seq_out # mutate the normal sequences into mutated sequences new_fi_list = [] infi_fa = open(sys.argv[1], 'r') for line in infi_fa : line = line.strip() if line[0] == '>' : ab_list = line.strip().split('|') match_sub = pat_sub.search(ab_list[0]) match_del = pat_del.search(ab_list[0]) match_ins = pat_ins.search(ab_list[0]) new_fi_list.append(line) print line else : if match_sub : detail = match_sub.group(3) site = int(match_sub.group(2)) - int(ab_list[2]) if ab_list[-1][2] == '+' : if line[site] == detail[0] : newseq = line[:site] + detail[2] + line[site+1:] else : if line[-site-1] == reverse(detail[0]) : newbase = reverse(detail[2]) newseq = line[:-site-1] + newbase + line[-site:] elif match_del : detail = match_del.group(3) site = int(match_del.group(2)) - int(ab_list[2]) if ab_list[-1][2] == '+' : if line[site] == detail[0] : newseq = line[:site] + line[site+len(detail):] else : if line[-site-1] == reverse(detail[0]) : newseq = line[:-site-len(detail)] + line[-site:] elif match_ins : detail = match_ins.group(4) ins_site = int(match_ins.group(2).strip().split('_')[0]) site = ins_site - int(ab_list[2]) if ab_list[-1][2] == '+' : newseq = line[:site] + detail + line[site:] else : newseq = line[:-site-1] + reverse(detail) + line[-site-1:] if newseq : new_fi_list.append(newseq) print len(newseq) outfi = open(sys.argv[2],'w') outfi.write('\n'.join(new_fi_list)+'\n') infi_fa.close() outfi.close()