python 对文件头部进行分类

输入文件夹路径,对文件夹里的文件进行分类:不是根据后缀名,而是根据文件头部的二进制内容进行分类

我只实现了文件类型识别 不会写分类

有带佬吗

可以直接将代码发我

文件头部指的是什么

https://blog.csdn.net/privateobject/article/details/78069500

上面的直接改一下ok了

所谓文件头部分类,就是你以二进制方式把文件的头几个字节读出来
然后写一组if判断一下到底跟什么格式是一致的
具体每种格式文件头是什么可以去网上查,都是公开的

pip install python-magic

>>> import magic

>>> magic.from_file('iceland.jpg')
'JPEG image data, JFIF standard 1.01'

>>> magic.from_file('iceland.jpg', mime=True)
'image/jpeg'

>>> magic.from_file('greenland.png')
'PNG image data, 600 x 1000, 8-bit colormap, non-interlaced'

>>> magic.from_file('greenland.png', mime=True)
'image/png'

我的理解是这样的:文件分类首先要配置好不同的文件分类策略(写在 conf 配置文件里),
最后维护一个字典,保存分配完成后的结果:{"分类名": [文件1, 文件2, ...]}。
如果要分类后放在某个盘 那就动态创建好目录然后将当前判断的文件移动到指定目录下。
至于怎么扫描文件夹下的所有文件,可以结合你自己的代码来实现。
大体就是这样,不知道符不符合你的要求。


import os

# Path of the category configuration file, relative to the working directory.
CONFIG = "classfile.conf"


def create_default_config():
    """Write the built-in category table to ``CONFIG`` and report it.

    Each line has the form ``Category: ext1, ext2, ...``. The file is
    overwritten if it already exists, and the last line carries no
    trailing newline.
    """
    default_rules = [
        "Music: mp3, aac, flac, ogg, wma, m4a, aiff, wav, amr",
        "Videos: flv, ogv, avi, mp4, mpg, mpeg, 3gp, mkv, ts, webm, vob, wmv",
        "Pictures: png, jpeg, gif, jpg, bmp, svg, webp, psd, tiff",
        "Archives: rar, zip, 7z, gz, bz2, tar, dmg, tgz, xz, iso, cpio",
        "Documents: txt, pdf, doc, docx, odf, xls, xlsv, xlsx, "
        "ppt, pptx, ppsx, odp, odt, ods, md, json, csv",
        "Books: mobi, epub, chm",
        "DEBPackages: deb",
        "Programs: py, exe, msi",
        "RPMPackages: rpm",
    ]
    payload = "\n".join(default_rules)
    with open(CONFIG, "w") as conffile:
        conffile.write(payload)
    print("CONFIG file created at: " + CONFIG)

# Maps the hex dump of a file's leading bytes to a short type name.
#
# Keys are written in lowercase because the lookup key is produced with
# ``.hex()``, which emits lowercase digits; the uppercase keys that used to
# be here could never match.  Keys that were listed twice have been reduced
# to the entry that actually took effect (the later one).
FILE_TYPE = {
    # Only 5 bytes long, so it can match only when the lookup compares
    # signatures by prefix rather than by the exact 10-byte dump.
    "68746d6c3e": 'html',
    '44656c69766572792d64': 'eml',
    'ffd8ffe000104a464946': 'jpg',
    '89504e470d0a1a0a0000': 'png',
    '47494638396126026f01': 'gif',
    '49492a00227105008037': 'tif',
    '424d228c010000000000': 'bmp',
    '424d8240090000000000': 'bmp',
    '424d8e1b030000000000': 'bmp',
    '41433130313500000000': 'dwg',
    '3c21444f435459504520': 'html',
    '3c21646f637479706520': 'htm',
    '48544d4c207b0d0a0942': 'css',
    '696b2e71623d696b2e71': 'js',
    '7b5c727466315c616e73': 'rtf',
    '38425053000100000000': 'psd',
    '46726f6d3a203d3f6762': 'eml',
    # This key was previously also listed as 'xls'; 'doc' was the value
    # that won, so it is the one kept.
    'd0cf11e0a1b11ae10000': 'doc',
    '5374616e64617264204a': 'mdb',
    '252150532d41646f6265': 'ps',
    '255044462d312e350d0a': 'pdf',
    '2e524d46000000120001': 'rmvb',
    '464c5601050000000900': 'flv',
    '00000020667479706d70': 'mp4',
    '49443303000000002176': 'mp3',
    '000001ba210001000180': 'mpg',
    '3026b2758e66cf11a6d9': 'wmv',
    '52494646e27807005741': 'wav',
    '52494646d07d60074156': 'avi',
    '4d546864000000060001': 'mid',
    '504b0304140000080044': 'zip',
    '504b03040a0000080000': 'zip',
    '526172211a0700cf9073': 'rar',
    '235468697320636f6e66': 'ini',
    # This key was previously also listed as 'zip'; 'jar' was the value
    # that won, so it is the one kept.
    '504b03040a0000000000': 'jar',
    '4d5a9000030000000400': 'exe',
    '3c25402070616765206c': 'jsp',
    '4d616e69666573742d56': 'mf',
    '3c3f786d6c2076657273': 'xml',
    '494e5345525420494e54': 'sql',
    '7061636b616765207765': 'java',
    '406563686f206f66660d': 'bat',
    '1f8b0800000000000000': 'gz',
    '6c6f67346a2e726f6f74': 'properties',
    'cafebabe0000002e0041': 'class',
    '49545346030000006000': 'chm',
    '04000000010000001300': 'mxp',
    '504b0304140006000800': 'docx',
    '6431303a637265617465': 'torrent',
    '64656620746f74616c61': 'py',
}

# Category name -> comma-separated extension string, filled by checkconfig().
formats = {}


def checkconfig(config_path=None):
    """Load the category configuration into the module-level ``formats``.

    Args:
        config_path: Optional path of the configuration file to read.
            When omitted, ``CONFIG`` is used and a default file is created
            first if it does not exist yet.

    Each usable line has the form ``Category: ext1, ext2``; spaces are
    removed from both sides, so the stored value is ``"ext1,ext2"``.
    Blank lines and lines without a colon are skipped instead of raising
    ``IndexError``.

    Returns:
        None. Results are stored in ``formats``.
    """
    if config_path is None:
        config_path = CONFIG
        # Only the default location is auto-created; the default writer
        # always targets CONFIG.
        if not os.path.exists(config_path):
            create_default_config()

    with open(config_path, 'r') as conf:
        for line in conf:
            category, separator, extensions = line.partition(':')
            category = category.replace(" ", "").strip()
            if not separator or not category:
                continue
            formats[category] = extensions.replace(" ", "").strip()
    return

def fintype(full_path):
    """Identify a file's type from the hex dump of its first ten bytes.

    Args:
        full_path: Path of the file to inspect.

    Returns:
        The type name from ``FILE_TYPE`` whose signature matches the start
        of the file, ``None`` when no signature matches, or the string
        ``"Incorrect Request :( !!!"`` when the path cannot be opened or
        read (for example a missing file or a directory).

    Side effects:
        Reloads the category configuration into ``formats`` through
        ``checkconfig()`` whenever the file could be read.
    """
    try:
        # Only the first 10 bytes take part in the lookup; the context
        # manager guarantees the handle is closed.
        with open(full_path, "rb") as handle:
            header = handle.read(10)
    except IOError:
        return "Incorrect Request :( !!!"

    # Shared loader instead of a second copy of the parsing loop.
    checkconfig()

    header_hex = header.hex()  # lowercase hex digits
    # Longest signatures first so a short prefix never shadows a longer,
    # more specific one.  Keys are lowercased so that table entries written
    # with uppercase digits still match.
    for signature in sorted(FILE_TYPE, key=len, reverse=True):
        if signature and header_hex.startswith(signature.lower()):
            return FILE_TYPE[signature]
    return None


if __name__ == '__main__':
    # Sample inputs; replace with the paths that should be classified.
    filepath = [
        "/Users/kingfei/Desktop/DataManageBlog/ceshi/tupian",
        "/Users/kingfei/Desktop/DataManageBlog/flaskr/auth.py",
        "/Users/kingfei/Desktop/DataManageBlog/ceshi/LICENSE",
    ]

    # Category name -> list of paths assigned to it.
    dic = {}
    for path in filepath:
        is_type = fintype(path)

        matched = []
        if is_type is not None:
            # Compare against whole extensions; a substring test on the
            # comma-joined string would let "js" match "json", etc.
            matched = [
                category
                for category, extensions in formats.items()
                if is_type in extensions.split(',')
            ]

        # Anything without a category (no signature, unreadable path, or a
        # type absent from the config) is recorded under 'unknow' rather
        # than silently dropped.
        for category in matched or ['unknow']:
            dic.setdefault(category, []).append(path)

    print(dic)