"""Map coding-region mutations from a TSV file onto the CDS records of a GTF.

Usage:
    python <script> <mutations.tsv> <annotation.gtf> <output.gff>

The TSV is read first: its header line is ignored, column 1 holds the
mutation description (it must contain ``chrN:g.<position>``) and column 3
holds an annotation string from which the gene name is taken.  Rows whose
annotation marks a UTR or an intron are dropped.

The GTF is then scanned.  For every ``CDS`` record whose ``gene_name`` has
at least one retained mutation, each mutation lying inside the record's
start/end interval produces one tab-separated output line.  The number of
CDS records whose gene had mutations is printed at the end.
"""
import re
import sys

# Chromosome (group 1) and first genomic coordinate (group 2) of a mutation
# description such as "chr1:g.100A>T".
pat = re.compile(r'(chr.{1,2}):g\.([0-9]{1,100})[ATCGdi_].')
# Value of the gene_name attribute in GTF column 9.
pat_gene = re.compile(r'.gene_name "(.*?)";.')


def _gene_from_annotation(annotation):
    """Return the gene token of a TSV annotation, or None if the row is unusable.

    Annotations whose characters 2..10 read ``Prime UTR`` or which start with
    ``Intron`` are rejected.  For the rest, the gene is the second
    whitespace-separated token when the annotation starts with 'M' or 'F',
    and the third token otherwise.
    NOTE(review): presumably this distinguishes one-word from two-word
    mutation-type labels -- confirm against the TSV producer.
    """
    if not annotation:
        return None
    if annotation[2:11] == 'Prime UTR' or annotation[:6] == 'Intron':
        return None
    tokens = annotation.split()
    gene_idx = 1 if annotation[0] in ('M', 'F') else 2
    if len(tokens) <= gene_idx:
        # Too few tokens to contain a gene name; skip instead of crashing.
        return None
    return tokens[gene_idx]


def load_variants(tsv_lines):
    """Group retained mutations by gene.

    Args:
        tsv_lines: iterable of TSV text lines; the first one is a header.

    Returns:
        dict mapping gene name -> list of ``(description, position)`` tuples
        in input order, where ``position`` is the integer captured by
        ``pat``.  Rows with fewer than three columns, a rejected annotation,
        or a description that ``pat`` does not match are skipped.
    """
    variants = {}
    for line_no, line in enumerate(tsv_lines):
        if line_no == 0:
            continue  # header line
        fields = line.strip().split('\t')
        if len(fields) < 3:
            continue
        gene = _gene_from_annotation(fields[2])
        if gene is None:
            continue
        # Parse the coordinate once here rather than once per CDS record.
        match = pat.search(fields[0])
        if match is None:
            continue
        variants.setdefault(gene, []).append((fields[0], int(match.group(2))))
    return variants


def annotate_cds(gtf_lines, variants, out):
    """Write one output line per (CDS record, contained mutation) pair.

    Each output line is the GTF record with column 2 and the last column
    replaced by '.', and column 3 replaced by
    ``description|gene|start|end|strand|frame``.

    Mutations are matched to records by gene name only; the chromosome in
    the mutation description is not compared with GTF column 1.

    Args:
        gtf_lines: iterable of GTF text lines.
        variants: mapping as returned by ``load_variants``.
        out: writable text file object.

    Returns:
        Number of CDS records whose gene has at least one mutation
        (whether or not any mutation fell inside the record).
    """
    matched = 0
    for line in gtf_lines:
        fields = line.strip().split('\t')
        # Header/comment lines and truncated records have fewer than 9 columns.
        if len(fields) < 9 or fields[2] != 'CDS':
            continue
        gene_match = pat_gene.search(fields[8])
        if gene_match is None:
            continue
        gene = gene_match.group(1)
        if gene not in variants:
            continue
        matched += 1
        start, end = int(fields[3]), int(fields[4])
        for info, site in variants[gene]:
            # GTF start/end are 1-based and inclusive, so the boundary bases
            # belong to the CDS.
            if start <= site <= end:
                record = list(fields)
                record[2] = '|'.join(
                    (info, gene, fields[3], fields[4], fields[6], fields[7]))
                record[-1] = '.'
                record[1] = '.'
                out.write('\t'.join(record) + '\n')
    return matched


def main(argv=None):
    """Run the script; ``argv`` defaults to ``sys.argv[1:]``.

    Expects three paths: input TSV, input GTF, output file.  Prints and
    returns the count computed by ``annotate_cds``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.exit('usage: %s <mutations.tsv> <annotation.gtf> <output.gff>'
                 % sys.argv[0])
    tsv_path, gtf_path, out_path = args[:3]

    with open(tsv_path, 'r') as in_tsv:
        variants = load_variants(in_tsv)

    with open(gtf_path, 'r') as in_gtf, open(out_path, 'w') as out_gff:
        matched = annotate_cds(in_gtf, variants, out_gff)

    print(matched)
    return matched


if __name__ == '__main__':
    main()