>1
qq
>2
wwwwwwwwwwwwww
>3
qqqqqweqeq
>4
rrrrrrrrrrrrrrrrrr
>5
tttttttttttttttttttt
>6
ddssaa
import pandas as pd
import os
leng=1
f = open("111.fna","r")
lines = f.readline()
chr = 0
l = 1
thefile = open("111.fna","r")
while True:
buffer = thefile.read()
if not buffer:
break
l += buffer.count('\n')
thefile.close()
lp = l + 1
h = 1 # 计算行数
while True:
if ">" in lines: # 如果有>号,
chr = chr + 1
lines = f.readline() #则读取下一行
h = h + 1
exec("dic%s=dict()"%chr)
for i in range(len(lines)-leng):
sub = lines[i:i+leng]
exec("x = dic%s.keys()"%chr)
if sub in x:
exec("dic%s[sub] += 1"%chr)
else:
exec("dic%s[sub] = 1"%chr)
exec("df%s=pd.DataFrame([dic%s])"%(chr,chr))
outputpath = "C"+str(chr)+"-"+str(leng)+'NCount.csv'
exec("df%s.to_csv(outputpath,sep=',',index=False,header=True)"%chr)
lines = f.readline()
h = h + 1
if h >= l + 1:
break
f.close()
此时的
dic1={'q': 2}
dic2={'w': 14}
……(前5个字典均正常)……
dic6={'d': 2, 's': 2, 'a': 1}
少一个a。
我用过直接readline,但是总是不能读取最后一行,才先计算行数再计数。
dic6=dic6={'d': 2, 's': 2, 'a': 2}
注:用字典计算字符串频率的方法来自https://ask.csdn.net/questions/7405804?spm=1001.2014.3001.5501
因为最后一行没有回车,你还是减了一个leng
import pandas as pd
from collections import Counter
with open(filename) as f:
i = 0
x = f.readline().replace("\n", '')
while x:
if ">" not in x:
s = f"dic{i}=dict(Counter('" + x + "'))"
exec(s)
exec(f"print(dic{i})")
exec(f"df=pd.DataFrame([dic{i}])")
outputpath = dirname + f"\\C{i}"+'NCount.csv'
ss = "df.to_csv(r'" + outputpath + "',sep=',',encoding='utf-8',index=False,header=True)"
exec(ss)
i += 1
x = f.readline().replace("\n", '')