1、需求 有这样一个需求,将采集在 Excel 中的人员信息(包含照片)导入到 Mysql 库。需求很简单,就是读取 Excel 中的数据插入到 Mysql 中的表,问题在于照片怎么读取?
2、分析 对于读取文本数据,直接按行读取即可;对于图片,常用做法是,将 Excel 文件的后缀名改为 zip,然后解压文件,对应文件名目录下有一个\xl\media
的目录,里面便是我们要照片的图片,然而,他的文件名、文件顺序都是乱的,确定不出图片对应的记录,怎么办呢?其实目录中还有这样一个 xml 文件\xl\drawings\drawing1.xml
其中就有图片与记录的对应关系,只要解析 xml 文件就可以了。知道了这个思路,那我们来一步步的实现他。
3、实现 1、复制并修改指定目录下的文件类型名,将excel后缀名修改为.zip
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 def copy_change_file_name (file_path, new_type='.zip' ) : if not isfile_exist(file_path): return '' extend = os.path.splitext(file_path)[1 ] if extend != '.xlsx' and extend != '.xls' : print("It's not a excel file! %s" % file_path) return False file_name = os.path.basename(file_path) new_name = str(file_name.split('.' )[0 ]) + new_type dir_path = os.path.dirname(file_path) new_path = os.path.join(dir_path, new_name) if os.path.exists(new_path): os.remove(new_path) shutil.copyfile(file_path, new_path) return new_path
2、解压文件
1 2 3 4 5 6 7 8 9 10 11 12 13 def unzip_file (zipfile_path) : if not isfile_exist(zipfile_path): return False if os.path.splitext(zipfile_path)[1 ] != '.zip' : print("It's not a zip file! %s" % zipfile_path) return False file_zip = zipfile.ZipFile(zipfile_path, 'r' ) file_name = os.path.basename(zipfile_path) zipdir = os.path.join(os.path.dirname(zipfile_path), str(file_name.split('.' )[0 ])) for files in file_zip.namelist(): file_zip.extract(files, os.path.join(zipfile_path, zipdir)) file_zip.close() return True
3、读取解压后的文件夹,打印图片路径
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 def read_img (zipfile_path) : img_dict = dict() if not isfile_exist(zipfile_path): return False dir_path = os.path.dirname(zipfile_path) file_name = os.path.basename(zipfile_path) pic_dir = 'xl' + os.sep + 'media' pic_path = os.path.join(dir_path, str(file_name.split('.' )[0 ]), pic_dir) file_list = os.listdir(pic_path) for file in file_list: filepath = os.path.join(pic_path, file) print(filepath) img_index = int(re.findall(r'image(\d+)\.' , filepath)[0 ]) img_dict[img_index] = dict(img_index=img_index, img_path=filepath) return img_dict
4、解析xml 文件,获取图片在excel表格中的索引位置信息
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 def get_img_pos_info (zip_file_path, img_dict, img_feature) : """解析xml 文件,获取图片在excel表格中的索引位置信息""" os.path.dirname(zip_file_path) dir_path = os.path.dirname(zip_file_path) file_name = os.path.basename(zip_file_path) xml_dir = 'xl' + os.sep + 'drawings' + os.sep + 'drawing1.xml' xml_path = os.path.join(dir_path, str(file_name.split('.' )[0 ]), xml_dir) image_info_dict = parse_xml(xml_path, img_dict, img_feature=img_feature) return image_info_dict def get_img_info (excel_file_path, img_feature) : if img_feature not in ["img_index" , "img_path" , "img_base64" ]: raise Exception('图片返回参数错误, ["img_index", "img_path", "img_base64"]' ) zip_file_path = copy_change_file_name(excel_file_path) if zip_file_path != '' : if unzip_file(zip_file_path): img_dict = read_img(zip_file_path) image_info_dict = get_img_pos_info(zip_file_path, img_dict, img_feature) return image_info_dict return dict() def parse_xml (file_name, img_dict, img_feature='img_path' ) : image_info = dict() dom_obj = xmldom.parse(file_name) element = dom_obj.documentElement def _f (subElementObj) : for anchor in subElementObj: xdr_from = anchor.getElementsByTagName('xdr:from' )[0 ] col = xdr_from.childNodes[0 ].firstChild.data row = xdr_from.childNodes[2 ].firstChild.data embed = \ anchor.getElementsByTagName('xdr:pic' )[0 ].getElementsByTagName('xdr:blipFill' )[0 ].getElementsByTagName('a:blip' )[0 ].getAttribute('r:embed' ) image_info[(int(row), int(col))] = img_dict.get(int(embed.replace('rId' , '' )), {}).get(img_feature) sub_twoCellAnchor = element.getElementsByTagName("xdr:twoCellAnchor" ) sub_oneCellAnchor = element.getElementsByTagName("xdr:oneCellAnchor" ) _f(sub_twoCellAnchor) _f(sub_oneCellAnchor) return image_info
5、读取包含图片的excel数据
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 def read_excel_info (file_path, img_col_index, img_feature='img_path' ) : img_info_dict = get_img_info(file_path, img_feature) book = xlrd.open_workbook(file_path) sheet = book.sheet_by_index(0 ) head = dict() for i, v in enumerate(sheet.row(0 )): head[i] = v.value info_list = [] for row_num in range(sheet.nrows): d = dict() for col_num in range(sheet.ncols): if row_num == 0 : continue if 'empty:' in sheet.cell(row_num, col_num).__str__(): if col_num in img_col_index: d[head[col_num]] = img_info_dict.get((row_num, col_num)) else : d[head[col_num]] = sheet.cell(row_num, col_num).value else : d[head[col_num]] = sheet.cell(row_num, col_num).value if d: info_list.append(d) return info_list
1 2 3 4 5 6 7 8 9 10 11 import reimport xml.dom.minidom as xmldomimport osimport zipfileimport shutilimport xlrdif __name__ == '__main__' : path = r'E:\\员工信息.xlsx' data = read_excel_info(path, img_col_index=[4 ]) print(data)
完整代码:https://github.com/dddreams/read-excel-image/blob/master/read_users.py
4、处理图片对应不到的问题 有时会出现图片对应不到记录的情况,这是因为 Excel 中图片不在单独的单元格内,指定的列中取不到图片,这就需要调整 Excel 内的数据了,但是如果数据量比较大,调整起来就比较麻烦了。其实,我们可以借助 Excel 宏的功能,批量修改图片大小,批量让其居中,位于单元格内部,这样在解析 xml 时就能对应到每条记录了。
打开 Excel 按下 Alt + F11
,然后点击【插入】菜单,选择【模块】复制下面代码:
1 2 3 4 5 6 7 Sub dq()Dim shp As ShapeFor Each shp In ActiveSheet.Shapesshp.Left = (shp.TopLeftCell.Width - shp.Width) / 2 + shp.TopLeftCell.Left shp.Top = (shp.TopLeftCell.Height - shp.Height) / 2 + shp.TopLeftCell.Top Next End Sub
然后返回 Excel 【开始】菜单中点击【查找和选择】选择【定位条件】中【对象】,将选中所有的图片,然后按下Alt+F8
点击【执行】就可以批量将图片在所在单元格居中了。如果图片大小过大,全部选中后更改图片大小即可。然后解析 Xml 文件就不会出现图片与记录对应不到的情况了。
更多文章请关注微信公众号: 治恒说说