如何用Python批量提取PDF简历中的所需信息,并写入到Excel中?
提取「姓名」、「学校」、「学历」、「电话」、「邮箱」这5项信息
import glob
import os
import re

import pdfplumber
import xlwt
# Glob pattern for the résumé PDFs. Written as a raw string so the Windows
# backslashes stay literal instead of relying on unrecognized escape
# sequences such as "\D" and "\S".
path = r'D:\Desktop\Sample\*.pdf'
list1 = glob.glob(path)  # every PDF file matching the pattern, as a list
# Excel column headers, in column order; they double as the keys of a record.
FIELD_NAMES = ('姓名', '学校', '学历', '电话', '邮箱')

# Degree keywords in priority order: the highest degree mentioned wins.
_DEGREES = ('博士', '硕士', '本科')

_SCHOOL_RE = re.compile(r'\S+大学')
# Mainland mobile numbers, see
# https://baike.baidu.com/item/%E6%89%8B%E6%9C%BA%E5%8F%B7%E7%A0%81/1417348
# Only the 13/15/16/17/18 prefixes are matched. 198/199 numbers are left out
# on purpose: they easily overlap with the birth-year digits of ID numbers of
# people born in the 1980s/1990s, so those résumés go to manual review.
_PHONE_RE = re.compile(r'1[35678]\d{9}')
# ASCII-only character classes so adjacent Chinese text is never swallowed;
# the dot before each domain label is escaped and any TLD is accepted.
_EMAIL_RE = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+')


def _first_match(pattern, text, fallback):
    """Return the first match of *pattern* in *text*, or *fallback* if none."""
    found = pattern.search(text)
    return found.group() if found else fallback


def extract_name(path):
    """Return the candidate name encoded in a résumé file name.

    The name is taken to be everything before the first ``-`` of the file
    name (extension removed), e.g. ``张三-中国XX大学-硕士.pdf`` -> ``张三``.
    A file name without ``-`` yields the whole stem; an empty result yields
    a "needs manual review" message.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    name = stem.split('-', 1)[0].strip()
    return name or '未能获取姓名,需人工复核'


def parse_resume_text(text):
    """Extract school, degree, phone and e-mail from résumé text.

    Args:
        text: Text of the résumé page; ``None`` or ``''`` is accepted and
            treated as "nothing found".

    Returns:
        A dict keyed by '学校', '学历', '电话' and '邮箱'. Any field that cannot
        be found holds a "needs manual review" message instead.
    """
    text = text or ''
    education = next(
        (degree for degree in _DEGREES if degree in text),
        '学历未知,需人工复核简历',  # no degree mentioned anywhere in the text
    )
    return {
        '学校': _first_match(_SCHOOL_RE, text, '未能获取学校名称,需人工复核'),
        '学历': education,
        '电话': _first_match(_PHONE_RE, text, '未能获取电话,需人工复核'),
        '邮箱': _first_match(_EMAIL_RE, text, '未能获取邮箱,需人工复核'),
    }


def collect_single_info(path):
    """Extract the five fields of interest from one PDF résumé.

    Only the first page of the PDF is inspected. The file path and the
    extracted values are printed as progress output.

    Args:
        path: Path of a single PDF file.

    Returns:
        A dict keyed by the entries of ``FIELD_NAMES``.
    """
    print(path)
    with pdfplumber.open(path) as pdf:
        pages = pdf.pages
        # Extract the text once and reuse it for every field.
        text = pages[0].extract_text() if pages else ''
    record = {'姓名': extract_name(path)}
    record.update(parse_resume_text(text))
    for title in FIELD_NAMES:
        print(record[title])
    return record


def collect_multi_info(path):
    """Extract a record from every PDF matching the glob pattern *path*.

    Args:
        path: Glob pattern such as ``r'D:\\Desktop\\Sample\\*.pdf'``.

    Returns:
        A list with one record dict per matching file, sorted by file path;
        an empty list when nothing matches.
    """
    return [collect_single_info(pdf_path) for pdf_path in sorted(glob.glob(path))]


def save_records(records, output_path):
    """Write *records* to an ``.xls`` workbook: a header row, then one row each."""
    wb = xlwt.Workbook()
    sheet1 = wb.add_sheet('Sheet1')
    for col, title in enumerate(FIELD_NAMES):
        sheet1.write(0, col, title)  # fixed header row
    for row, record in enumerate(records, start=1):
        for col, title in enumerate(FIELD_NAMES):
            sheet1.write(row, col, record[title])
    wb.save(output_path)


if __name__ == '__main__':
    # To debug a single file, call collect_single_info(<pdf path>) directly.
    save_records(collect_multi_info(path), "D:\\Desktop\\Sample\\候选人信息.xls")
    print("[System:信息提取完毕!]")
0)不知道怎么把全部PDF的内容提取出来之后,写到同一个Excel里
1)姓名没办法提取出来,想到的办法是「提取文件名第一个-前的内容」,但不会写代码……
姓名 | 学校 | 学历 | 电话 | 邮箱 |
---|---|---|---|---|
张三 | 中国XX大学 | 硕士 | 137****1234 | zhangsan@qq.com |
李四 | 华南XX大学 | 本科 | 159****5678 | lisi@163.com |