import sys
import re
# Mutation identifiers are expected to contain "<chrom>:g.<position>" followed
# by a change description; group 1 is the chromosome, group 2 the position.
# Raw string so that "\." is a regex escape, not an invalid string escape.
pat = re.compile(r'(chr.{1,2}):g\.([0-9]{1,100})[ATCGdi_].')
# No leading/trailing wildcard: gene_name must also be found when it is the
# first or the last attribute of the (stripped) GTF attribute column.
pat_gene = re.compile('gene_name "(.*?)";')


def collect_coding_mutations(tsv_lines):
    """Group mutations outside UTRs and introns by gene name.

    tsv_lines is an iterable of tab-separated lines whose first line is a
    header.  Column 0 holds the mutation identifier and column 2 an
    annotation string from which the gene name is taken.

    Returns a dict mapping gene name -> list of (mutation_id, position)
    tuples in input order, where position is the integer captured by ``pat``.

    Blank lines, lines with fewer than three columns, lines with an empty or
    too-short annotation, and (with a warning on stderr) mutation identifiers
    that ``pat`` cannot parse are skipped instead of raising.
    """
    mutations_by_gene = {}
    rows = iter(tsv_lines)
    next(rows, None)  # the first line is a header
    for line in rows:
        fields = line.strip().split('\t')
        if len(fields) < 3 or not fields[2]:
            continue
        annotation = fields[2]
        # Drop "? Prime UTR ..." and "Intron ..." annotations.
        if annotation[2:11] == 'Prime UTR' or annotation[:6] == 'Intron':
            continue
        tokens = annotation.split()
        # Annotations starting with 'M' or 'F' carry the gene name as their
        # second word, all others as their third word.
        # NOTE(review): presumably one-word vs. two-word effect names
        # (e.g. "Missense" vs. "Stop Gained") -- confirm against the TSV.
        gene_index = 1 if annotation.startswith(('M', 'F')) else 2
        if len(tokens) <= gene_index:
            continue
        mutation_id = fields[0]
        parsed = pat.search(mutation_id)
        if parsed is None:
            sys.stderr.write(
                'warning: cannot parse genomic position from %r; skipped\n'
                % mutation_id)
            continue
        # Parse the position once here rather than once per CDS record.
        entry = (mutation_id, int(parsed.group(2)))
        mutations_by_gene.setdefault(tokens[gene_index], []).append(entry)
    return mutations_by_gene


def write_cds_hits(gtf_lines, mutations_by_gene, out):
    """Write one GFF-like line per (CDS record, mutation inside it) pair.

    gtf_lines is an iterable of GTF lines, mutations_by_gene the mapping
    built by collect_coding_mutations() and out a writable text stream.

    For every emitted line column 2 becomes
    "<mutation_id>|<gene>|<start>|<end>|<strand>|<frame>", and column 1 and
    the last column are replaced by ".".

    Returns the number of CDS records whose gene has at least one collected
    mutation, whether or not a mutation falls inside the record.

    Comment lines, lines with fewer than nine columns and CDS records without
    a gene_name attribute are skipped instead of raising.
    """
    matched_cds = 0
    for line in gtf_lines:
        fields = line.strip().split('\t')
        if len(fields) < 9 or fields[2] != 'CDS':
            continue
        gene_match = pat_gene.search(fields[8])
        if gene_match is None:
            continue
        gene = gene_match.group(1)
        if gene not in mutations_by_gene:
            continue
        matched_cds += 1
        start = int(fields[3])
        end = int(fields[4])
        for mutation_id, site in mutations_by_gene[gene]:
            # NOTE(review): strict comparison excludes a mutation on the first
            # or last base of the CDS although GTF coordinates are inclusive;
            # kept as in the original -- confirm whether "<=" is intended.
            if start < site < end:
                record = list(fields)
                record[2] = '|'.join(
                    [mutation_id, gene, fields[3], fields[4],
                     fields[6], fields[7]])
                record[-1] = '.'
                record[1] = '.'
                out.write('\t'.join(record) + '\n')
    return matched_cds


def main(argv=None):
    """Run the script: <mutations.tsv> <annotation.gtf> <output.gff>.

    Reads the mutation table, writes the annotated CDS records to the output
    file and prints the number of CDS records whose gene carries a mutation.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.exit('usage: %s <mutations.tsv> <annotation.gtf> <output.gff>'
                 % argv[0])

    # Context managers close the files even when parsing fails half-way.
    with open(argv[1], 'r') as in_tsv:
        mutations_by_gene = collect_coding_mutations(in_tsv)
    with open(argv[2], 'r') as in_gtf, open(argv[3], 'w') as out_gff:
        matched_cds = write_cds_hits(in_gtf, mutations_by_gene, out_gff)

    # Parenthesised single argument prints identically on Python 2 and 3.
    print(matched_cds)


if __name__ == '__main__':
    main()