python处理tsv文件

现有一个矩阵(tsv文件),行名为ENSG开头+数字编号
前五行五列情况如图

img


想要读取output.json文件中如图的对应关系,把ENSG开头+数字编号替换为后面的代号内容
例如:行名为ENSG00000186092 替换为OR4F5

img


其中output.json中各个列使用\t分隔
输出名称替换后的矩阵(tsv文件)到指定路径

    保留在json中不存在的数据:
    import pandas as pd
    # Load the JSON file as a Series and turn it into a plain dict used as
    # the lookup table: current 'Ensembl_ID' value -> replacement value.
    json_data = pd.read_json("xxx.json", typ='series')
    json_dict = json_data.to_dict()
    # Read the matrix 1000 rows at a time instead of loading it in one go.
    df_chunk = pd.read_csv("xxx.tsv", sep='\t', chunksize=1000)
    df_chunk_list = []
    for chunk in df_chunk:
        # Keep only the text before the first "." in each ID
        # (e.g. "ENSG00000186092.4" -> "ENSG00000186092").
        gene_ids = chunk['Ensembl_ID'].str.split('.', n=1).str[0]
        # Replace IDs found in the lookup table; IDs that are not in the
        # table fall back to themselves, so those rows are kept unchanged.
        chunk['Ensembl_ID'] = gene_ids.map(lambda gene_id: json_dict.get(gene_id, gene_id))
        df_chunk_list.append(chunk)
    # Stitch the processed chunks back together and write the result as TSV.
    result_Df = pd.concat(df_chunk_list)
    result_Df.to_csv('result.tsv', sep='\t', index=False)

    不保留在json中不存在的数据:
    import pandas as pd
    # Load the JSON file as a Series and turn it into a plain dict used as
    # the lookup table: current 'Ensembl_ID' value -> replacement value.
    json_data = pd.read_json("xxx..json", typ='series')
    json_dict = json_data.to_dict()
    # Built once so the membership test below does not rebuild it per chunk.
    known_ids = list(json_dict)
    # Read the matrix 1000 rows at a time instead of loading it in one go.
    df_chunk = pd.read_csv("xxx..tsv", sep='\t', chunksize=1000)
    df_chunk_list = []
    for chunk in df_chunk:
        # Keep only the text before the first "." in each ID
        # (e.g. "ENSG00000186092.4" -> "ENSG00000186092").
        gene_ids = chunk['Ensembl_ID'].str.split('.', n=1).str[0]
        # Select the rows whose ID has an entry in the lookup table; every
        # other row is discarded in one step rather than dropped one by one
        # while the chunk is being iterated.
        has_name = gene_ids.isin(known_ids)
        chunk = chunk.loc[has_name].copy()
        chunk['Ensembl_ID'] = gene_ids.loc[has_name].map(json_dict)
        df_chunk_list.append(chunk)
    # Stitch the processed chunks back together and write the result as TSV.
    result_Df = pd.concat(df_chunk_list)
    result_Df.to_csv('result.tsv', sep='\t', index=False)

结果文件是什么样子的呢?举个例子看看


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
 
 
"""
create_author : python处理tsv文件
create_time   : 2022-9-23
program       : *_* .tsv file handler *_*
"""
 
import codecs
 
 
class TSV(object):
    """
    Reader that turns a tab-separated file into a list of row dicts.
    """

    def __init__(self, file):
        """
        Store the path of the file to read.
        :param file: .tsv file to handle.
        """
        self.file = file

    def __repr__(self):
        return "File {file} under handling......".format(file=self.file)

    def tsv(self):
        """
        Parse the file: the first non-blank line supplies the column names,
        every later non-blank line becomes one dict keyed by those names.
        Whitespace-only lines are ignored wherever they occur. Cells are
        paired with column names positionally; surplus cells or surplus
        column names on a line are left out of that line's dict.
        :return: List.
            one dict per data line, in file order (empty if there is no
            data line).
        """
        records = []
        with codecs.open(self.file, 'r', "utf-8") as stream:
            # Lazily filter out lines that contain nothing but whitespace.
            meaningful = (raw for raw in stream if not raw.isspace())
            columns = None
            for raw in meaningful:
                # Strip the line terminator only, so "\r\n" and "\n" files
                # parse the same way; then cut the line at the tabs.
                cells = raw.rstrip("\r\n").split('\t')
                if columns is None:
                    # First meaningful line is the header.
                    columns = cells
                    continue
                records.append(dict(zip(columns, cells)))
        return records
 
 
if __name__ == "__main__":
    # Demo: write a small sample file named "tsv" into the current
    # directory, then parse it back with TSV and print the resulting rows.
    # Imported here because only this demo needs it.
    from textwrap import dedent

    rows = """
        Id\tContent
        1\tContent1
        2\tContent2
        3\tContent3
        4\tContent4
        1024\tContent1024
        """
    with codecs.open("tsv", 'w', "utf-8") as f:
        # dedent removes only the common source indentation, so spaces
        # inside a cell value would survive; write() is the call that
        # takes a single string.
        f.write(dedent(rows))
    TSV_Tester = TSV(file="tsv")
    print(TSV_Tester.tsv())