用python提取gff3中的ID,并根据start和end查找相应的序列,以fasta形式输出

用python提取gff3中的ID,并根据start和end查找相应的序列,以fasta形式输出

参考一下:

import sys

from Bio import SeqIO

# Read a GFF3 annotation file and print every "gene" feature as a FASTA
# record: the header is the feature's ID attribute, the body is the
# sub-sequence [start, end] cut from the matching reference sequence
# (reverse-complemented for features on the "-" strand).
gff_file = "example.gff3"
fasta_file = "example.fasta"


def _parse_gff3_attributes(column):
    """Return the GFF3 attributes column (``key=value;key=value``) as a dict.

    Items without an ``=`` are ignored. Looking attributes up by key avoids
    relying on ``ID`` being the first item in the column.
    """
    attributes = {}
    for item in column.split(";"):
        key, sep, value = item.strip().partition("=")
        if sep:
            attributes[key] = value
    return attributes


# Load all reference sequences once, keyed by record id, so that each
# feature's seqid (column 1) can be looked up directly.
seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))

with open(gff_file) as f:
    for line_no, line in enumerate(f, start=1):
        # "##FASTA" ends the annotation section; the lines after it are
        # embedded sequence data, not features.
        if line.startswith("##FASTA"):
            break
        if line.startswith("#"):
            continue
        fields = line.strip().split("\t")
        # Blank or truncated lines do not have the nine GFF3 columns.
        if len(fields) < 9 or fields[2] != "gene":
            continue

        seq_id = fields[0]
        start = int(fields[3])
        end = int(fields[4])
        strand = fields[6]

        gene_id = _parse_gff3_attributes(fields[8]).get("ID")
        if gene_id is None:
            print(
                f"{gff_file}:{line_no}: gene feature has no ID attribute, skipped",
                file=sys.stderr,
            )
            continue
        if seq_id not in seq_dict:
            print(
                f"{gff_file}:{line_no}: sequence {seq_id!r} not found in "
                f"{fasta_file}, skipped {gene_id}",
                file=sys.stderr,
            )
            continue

        # GFF3 coordinates are 1-based and inclusive; Python slices are
        # 0-based and end-exclusive, hence start - 1.
        seq = seq_dict[seq_id].seq[start - 1:end]
        if strand == "-":
            seq = seq.reverse_complement()
        print(f">{gene_id}\n{seq}")