现有一个矩阵(tsv文件),行名为ENSG开头+数字编号
前五行五列情况如图
保留 JSON 中不存在的数据(这些行的 Ensembl_ID 只去掉版本号,不做替换):
import pandas as pd

# Lookup table used to rewrite the 'Ensembl_ID' column: keys are matched
# against IDs with their version suffix removed.
json_data = pd.read_json("xxx.json", typ='series')
json_dict = json_data.to_dict()


def _translate_id(raw_id):
    """Strip the version suffix from an ID and map it through json_dict.

    The text after the first "." is discarded. IDs with no entry in
    json_dict are kept (in their version-less form) instead of being
    replaced.
    """
    base_id = raw_id.split(".")[0]
    return json_dict.get(base_id, base_id)


# Read the matrix 1000 rows at a time.
df_chunk = pd.read_csv("xxx.tsv", sep='\t', chunksize=1000)
df_chunk_list = []
for chunk in df_chunk:
    # One column-wise map replaces the per-row iterrows()/.loc loop, and
    # dict.get replaces the bare `except: pass` that hid every error
    # (including KeyboardInterrupt), not just missing keys.
    chunk['Ensembl_ID'] = chunk['Ensembl_ID'].map(_translate_id)
    df_chunk_list.append(chunk)

result_Df = pd.concat(df_chunk_list)
result_Df.to_csv('result.tsv', sep='\t', index=False)
不保留 JSON 中不存在的数据(这些行会被直接删除):
import pandas as pd

# Lookup table used to rewrite the 'Ensembl_ID' column: keys are matched
# against IDs with their version suffix removed.
# NOTE(review): the double dot in the two placeholder file names below looks
# like a typo -- confirm against the real file names.
json_data = pd.read_json("xxx..json", typ='series')
json_dict = json_data.to_dict()

# Read the matrix 1000 rows at a time.
df_chunk = pd.read_csv("xxx..tsv", sep='\t', chunksize=1000)
df_chunk_list = []
for chunk in df_chunk:
    # Discard everything after the first "." in each ID.
    base_ids = chunk['Ensembl_ID'].map(lambda raw_id: raw_id.split(".")[0])
    # Boolean mask of rows whose ID has an entry in the lookup table.
    # astype(bool) keeps the mask usable even if a chunk has no rows.
    known = base_ids.map(lambda base_id: base_id in json_dict).astype(bool)
    # Filter with the mask instead of dropping rows in place while iterating
    # over the same frame; rows without a mapping are discarded.
    chunk = chunk.loc[known].copy()
    chunk['Ensembl_ID'] = base_ids.loc[known].map(json_dict.get)
    df_chunk_list.append(chunk)

result_Df = pd.concat(df_chunk_list)
result_Df.to_csv('result.tsv', sep='\t', index=False)
结果文件是什么样子的呢?举个例子看看:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
create_author : python处理tsv文件
create_time : 2022-9-23
program : *_* .tsv file handler *_*
"""
import codecs
class TSV:
    """Reader for tab-separated (.tsv) files.

    The first non-blank line of the file is taken as the header; every
    following non-blank line becomes one dict keyed by the header fields.
    """

    def __init__(self, file):
        """Remember the file to read.

        :param file: path of the .tsv file to handle.
        """
        self.file = file

    def __repr__(self):
        return f"File {self.file} under handling......"

    def tsv(self):
        """Parse the file into a list of row dicts.

        Lines consisting only of whitespace are skipped, both before and
        after the header. All field values are returned as strings.

        :return: list of dicts, one per data line, mapping header name to
            field value. Empty when the file has no data lines.
        """
        header = None
        records = []
        with codecs.open(self.file, 'r', "utf-8") as f:
            for line in f:
                if line.isspace():
                    continue
                # Strip both "\r" and "\n" so files written on any OS parse
                # the same way.
                fields = line.rstrip("\r\n").split('\t')
                if header is None:
                    header = fields
                else:
                    # zip() stops at the shorter sequence: surplus fields and
                    # header columns without a value are silently left out.
                    records.append(dict(zip(header, fields)))
        return records
if __name__ == "__main__":
    # Demo: write a small sample file named "tsv", then parse it back.
    # The sample starts with an empty line, so the parser's blank-line
    # skipping is exercised too. It is built from explicit pieces so its
    # content does not depend on how this source file is indented.
    sample_rows = (
        "\n"
        "Id\tContent\n"
        "1\tContent1\n"
        "2\tContent2\n"
        "3\tContent3\n"
        "4\tContent4\n"
        "1024\tContent1024\n"
    )
    with codecs.open("tsv", 'w', "utf-8") as f:
        # write(), not writelines(): the payload is one string, and
        # writelines() would emit it one character at a time.
        f.write(sample_rows)

    # Read back only after the `with` block has closed (and flushed) the file.
    tsv_tester = TSV(file="tsv")
    print(tsv_tester.tsv())