用 Python 提取 GFF3 文件中的 ID,并根据 start 和 end 查找相应的序列,以 FASTA 格式输出
参考一下:
from Bio import SeqIO
# Read the GFF3 file and print every "gene" feature as a FASTA record
# (header = the feature's ID, body = the feature's sequence).
gff_file = "example.gff3"
fasta_file = "example.fasta"

# Index the reference sequences by record id so that a feature's seqid
# (GFF3 column 1) can be looked up directly.
seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))

with open(gff_file) as f:
    for line in f:
        line = line.strip()

        # An embedded FASTA section ("##FASTA", or a bare ">" header line)
        # ends the feature table; everything after it is sequence data.
        if line.startswith(("##FASTA", ">")):
            break

        # Skip blank lines, comments and directives.
        if not line or line.startswith("#"):
            continue

        fields = line.split("\t")
        # A feature line has nine tab-separated columns; ignore malformed
        # lines instead of failing with IndexError.
        if len(fields) < 9 or fields[2] != "gene":
            continue

        seq_id = fields[0]
        start = int(fields[3])
        end = int(fields[4])
        strand = fields[6]

        # Column 9 is a ";"-separated list of key=value pairs.
        attributes = [
            attr.strip().partition("=")
            for attr in fields[8].split(";")
            if "=" in attr
        ]
        if not attributes:
            continue

        # Prefer the real "ID" attribute wherever it appears in the list;
        # fall back to the first attribute's value (the previous behavior)
        # when the feature carries no ID.
        gene_id = next(
            (value for key, _, value in attributes if key == "ID"),
            attributes[0][2],
        )

        # GFF3 coordinates are 1-based and inclusive, hence start - 1.
        # A seqid missing from the FASTA file raises KeyError here.
        seq = seq_dict[seq_id].seq[start - 1:end]
        if strand == "-":
            seq = seq.reverse_complement()

        print(f">{gene_id}\n{seq}")