输入文件夹路径,对文件夹里的文件进行分类:不是根据后缀名,而是根据文件头部的十六进制字节(魔数)来分类
我只实现了文件类型识别 不会写分类
有带佬吗
可以直接将代码发我
文件头部指的是什么
https://blog.csdn.net/privateobject/article/details/78069500
上面的直接改一下ok了
所谓文件头部分类,就是你以二进制方式把文件的头几个字节读出来
然后写一组if判断一下到底跟什么格式是一致的
具体每种格式文件头是什么可以去网上查,都是公开的
pip install python-magic
>>> import magic
>>> magic.from_file('iceland.jpg')
'JPEG image data, JFIF standard 1.01'
>>> magic.from_file('iceland.jpg', mime=True)
'image/jpeg'
>>> magic.from_file('greenland.png')
'PNG image data, 600 x 1000, 8-bit colormap, non-interlaced'
>>> magic.from_file('greenland.png', mime=True)
'image/png'
我理解的就是这样的 文件分类首先得要配置不同文件分类策略(conf配置好)
最后维护一个字典,保存分类完成后的结果:{“分类名”: [文件1, 文件2, ...]}
如果要分类后放在某个盘 那就动态创建好目录然后将当前判断的文件移动到指定目录下。
至于怎么扫描文件夹下的所有文件,可以结合你自己的代码
大体就是这样,不知道符不符合你的要求
import os
# Path of the category configuration file, relative to the working directory.
CONFIG = "classfile.conf"


def create_default_config():
    """Write the built-in category configuration to ``CONFIG``.

    Every line of the generated file has the form
    ``Category: ext1, ext2, ...``. The file is opened in write mode, so any
    existing file at that path is replaced. A message naming the path is
    printed once the file has been written.
    """
    pieces = (
        "Music: mp3, aac, flac, ogg, wma, m4a, aiff, wav, amr\n",
        "Videos: flv, ogv, avi, mp4, mpg, mpeg, 3gp, mkv, ts, webm, vob, wmv\n",
        "Pictures: png, jpeg, gif, jpg, bmp, svg, webp, psd, tiff\n",
        "Archives: rar, zip, 7z, gz, bz2, tar, dmg, tgz, xz, iso, cpio\n",
        # The Documents entry is one config line split over two literals.
        "Documents: txt, pdf, doc, docx, odf, xls, xlsv, xlsx, ",
        "ppt, pptx, ppsx, odp, odt, ods, md, json, csv\n",
        "Books: mobi, epub, chm\n",
        "DEBPackages: deb\n",
        "Programs: py, exe, msi\n",
        # Last line deliberately has no trailing newline.
        "RPMPackages: rpm",
    )
    with open(CONFIG, "w") as handle:
        handle.write("".join(pieces))
    message = "CONFIG file created at: " + CONFIG
    print(message)
# Maps the hex encoding of a file's leading bytes to a short type name.
#
# Keys are written in lowercase because ``bytes.hex()`` / ``bytearray.hex()``
# emit lowercase digits; a key containing uppercase digits could never be
# matched by a lookup on such a string.
#
# Most signatures are 10 bytes (20 hex digits) long and include bytes beyond
# the format's real magic number, so they only recognise files whose header
# happens to match exactly.
FILE_TYPE = {
    # Only 5 bytes long, so an exact lookup with a 10-byte header never hits
    # it; it can only match when signatures are compared as prefixes.
    "68746d6c3e": "html",
    # This signature was listed twice (first as 'xls', later as 'doc'); the
    # later value won in the dict literal, so only that one is kept.
    "d0cf11e0a1b11ae10000": "doc",
    "44656c69766572792d64": "eml",
    "ffd8ffe000104a464946": "jpg",
    "89504e470d0a1a0a0000": "png",
    "47494638396126026f01": "gif",
    "49492a00227105008037": "tif",
    "424d228c010000000000": "bmp",
    "424d8240090000000000": "bmp",
    "424d8e1b030000000000": "bmp",
    "41433130313500000000": "dwg",
    "3c21444f435459504520": "html",
    "3c21646f637479706520": "htm",
    "48544d4c207b0d0a0942": "css",
    "696b2e71623d696b2e71": "js",
    "7b5c727466315c616e73": "rtf",
    "38425053000100000000": "psd",
    "46726f6d3a203d3f6762": "eml",
    "5374616e64617264204a": "mdb",
    "252150532d41646f6265": "ps",
    "255044462d312e350d0a": "pdf",
    "2e524d46000000120001": "rmvb",
    "464c5601050000000900": "flv",
    "00000020667479706d70": "mp4",
    "49443303000000002176": "mp3",
    "000001ba210001000180": "mpg",
    "3026b2758e66cf11a6d9": "wmv",
    "52494646e27807005741": "wav",
    "52494646d07d60074156": "avi",
    "4d546864000000060001": "mid",
    "504b0304140000080044": "zip",
    "504b03040a0000080000": "zip",
    # This signature was listed twice (first as 'zip', later as 'jar'); the
    # later value won in the dict literal, so only that one is kept.
    "504b03040a0000000000": "jar",
    "526172211a0700cf9073": "rar",
    "235468697320636f6e66": "ini",
    "4d5a9000030000000400": "exe",
    "3c25402070616765206c": "jsp",
    "4d616e69666573742d56": "mf",
    "3c3f786d6c2076657273": "xml",
    "494e5345525420494e54": "sql",
    "7061636b616765207765": "java",
    "406563686f206f66660d": "bat",
    "1f8b0800000000000000": "gz",
    "6c6f67346a2e726f6f74": "properties",
    "cafebabe0000002e0041": "class",
    "49545346030000006000": "chm",
    "04000000010000001300": "mxp",
    "504b0304140006000800": "docx",
    "6431303a637265617465": "torrent",
    "64656620746f74616c61": "py",
}
# Category name -> comma-separated extension string (spaces removed),
# filled in from the configuration file.
formats = {}


def checkconfig():
    """Load the category configuration from ``CONFIG`` into ``formats``.

    A default configuration file is created first when ``CONFIG`` does not
    exist. Each usable line has the form ``Category: ext1, ext2, ...``;
    spaces are removed from both parts and the extension list is stored as a
    single string under the category name. Existing entries in ``formats``
    with the same name are overwritten.

    Blank lines and lines without a ``:`` are skipped, so a trailing empty
    line in a hand-edited file no longer raises ``IndexError``.
    """
    if not os.path.exists(CONFIG):
        create_default_config()
    with open(CONFIG, 'r') as file:
        for line in file:
            spl = line.replace('\n', '').split(':')
            if len(spl) < 2:
                # No "name: extensions" pair on this line; nothing to record.
                continue
            key = spl[0].replace(" ", "")
            val = spl[1].replace(" ", "")
            formats[key] = val
def fintype(full_path):
    """Identify a file's type from the hex signature of its first bytes.

    The first 10 bytes of ``full_path`` are hex-encoded and compared with the
    signatures in ``FILE_TYPE``. A signature matches when the header starts
    with it, compared case-insensitively, so signatures shorter than 10 bytes
    or written with uppercase hex digits can match too. For 10-byte
    signatures this is the same as an exact match.

    As a side effect, the category configuration is (re)loaded through
    ``checkconfig()`` whenever the file could be read.

    Args:
        full_path: Path of the file to inspect.

    Returns:
        The type name from ``FILE_TYPE`` for the first matching signature,
        ``None`` when no signature matches, or the string
        ``"Incorrect Request :( !!!"`` when the file cannot be opened or read
        (for example a missing path or a directory).
    """
    try:
        # The context manager closes the handle even if the read fails.
        with open(full_path, "rb") as handle:
            header = handle.read(10)
    except IOError:
        return "Incorrect Request :( !!!"
    # Reuse the shared loader instead of duplicating its parsing here.
    checkconfig()
    header_hex = header.hex()
    for signature, file_type in FILE_TYPE.items():
        if header_hex.startswith(signature.lower()):
            return file_type
    return None
def matching_categories(file_type, categories):
    """Return the names of all categories that list ``file_type`` exactly.

    Args:
        file_type: Type name to look up; a value that is not listed anywhere
            (including ``None``) yields an empty list.
        categories: Mapping of category name to a comma-separated extension
            string without spaces, e.g. ``{"Documents": "doc,docx"}``.

    Returns:
        Category names in the mapping's iteration order. Extensions are
        compared as whole items, so ``"js"`` does not match ``"json"``.
    """
    return [
        name
        for name, extensions in categories.items()
        if file_type in extensions.split(",")
    ]


if __name__ == '__main__':
    filepath = [
        "/Users/kingfei/Desktop/DataManageBlog/ceshi/tupian",
        "/Users/kingfei/Desktop/DataManageBlog/flaskr/auth.py",
        "/Users/kingfei/Desktop/DataManageBlog/ceshi/LICENSE",
    ]
    # Load the categories up front so grouping does not depend on whether the
    # first path happens to be readable.
    checkconfig()
    dic = {}
    for path in filepath:
        is_type = fintype(path)
        # Unreadable paths, unrecognised headers and recognised types that no
        # category lists all end up under "unknow" instead of being dropped.
        for category in matching_categories(is_type, formats) or ["unknow"]:
            dic.setdefault(category, []).append(path)
    print(dic)