1、需求    有这样一个需求,将采集在 Excel 中的人员信息(包含照片)导入到 Mysql 库。需求很简单,就是读取 Excel 中的数据插入到 Mysql 中的表,问题在于照片怎么读取?
2、分析    对于读取文本数据,直接按行读取即可;对于图片,常用做法是,将 Excel 文件的后缀名改为 zip,然后解压文件,对应文件名目录下有一个\xl\media的目录,里面便是我们要照片的图片,然而,他的文件名、文件顺序都是乱的,确定不出图片对应的记录,怎么办呢?其实目录中还有这样一个 xml 文件\xl\drawings\drawing1.xml其中就有图片与记录的对应关系,只要解析 xml 文件就可以了。知道了这个思路,那我们来一步步的实现他。
3、实现 1、复制并修改指定目录下的文件类型名,将excel后缀名修改为.zip
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 def  copy_change_file_name (file_path, new_type='.zip' ) :    if  not  isfile_exist(file_path):         return  ''      extend = os.path.splitext(file_path)[1 ]       if  extend != '.xlsx'  and  extend != '.xls' :         print("It's not a excel file! %s"  % file_path)         return  False      file_name = os.path.basename(file_path)       new_name = str(file_name.split('.' )[0 ]) + new_type       dir_path = os.path.dirname(file_path)       new_path = os.path.join(dir_path, new_name)       if  os.path.exists(new_path):         os.remove(new_path)     shutil.copyfile(file_path, new_path)     return  new_path   
 
2、解压文件
1 2 3 4 5 6 7 8 9 10 11 12 13 def  unzip_file (zipfile_path) :    if  not  isfile_exist(zipfile_path):         return  False      if  os.path.splitext(zipfile_path)[1 ] != '.zip' :         print("It's not a zip file! %s"  % zipfile_path)         return  False      file_zip = zipfile.ZipFile(zipfile_path, 'r' )     file_name = os.path.basename(zipfile_path)       zipdir = os.path.join(os.path.dirname(zipfile_path), str(file_name.split('.' )[0 ]))       for  files in  file_zip.namelist():         file_zip.extract(files, os.path.join(zipfile_path, zipdir))       file_zip.close()     return  True  
 
3、读取解压后的文件夹,打印图片路径
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 def  read_img (zipfile_path) :    img_dict = dict()     if  not  isfile_exist(zipfile_path):         return  False      dir_path = os.path.dirname(zipfile_path)       file_name = os.path.basename(zipfile_path)       pic_dir = 'xl'  + os.sep + 'media'        pic_path = os.path.join(dir_path, str(file_name.split('.' )[0 ]), pic_dir)     file_list = os.listdir(pic_path)     for  file in  file_list:         filepath = os.path.join(pic_path, file)         print(filepath)         img_index = int(re.findall(r'image(\d+)\.' , filepath)[0 ])         img_dict[img_index] = dict(img_index=img_index, img_path=filepath)     return  img_dict 
 
4、解析xml 文件,获取图片在excel表格中的索引位置信息
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 def  get_img_pos_info (zip_file_path, img_dict, img_feature) :    """解析xml 文件,获取图片在excel表格中的索引位置信息"""      os.path.dirname(zip_file_path)     dir_path = os.path.dirname(zip_file_path)       file_name = os.path.basename(zip_file_path)       xml_dir = 'xl'  + os.sep + 'drawings'  + os.sep + 'drawing1.xml'      xml_path = os.path.join(dir_path, str(file_name.split('.' )[0 ]), xml_dir)     image_info_dict = parse_xml(xml_path, img_dict, img_feature=img_feature)       return  image_info_dict def  get_img_info (excel_file_path, img_feature) :    if  img_feature not  in  ["img_index" , "img_path" , "img_base64" ]:         raise  Exception('图片返回参数错误, ["img_index", "img_path", "img_base64"]' )     zip_file_path = copy_change_file_name(excel_file_path)     if  zip_file_path != '' :         if  unzip_file(zip_file_path):             img_dict = read_img(zip_file_path)             image_info_dict = get_img_pos_info(zip_file_path, img_dict, img_feature)             return  image_info_dict     return  dict() def  parse_xml (file_name, img_dict, img_feature='img_path' ) :         image_info = dict()     dom_obj = xmldom.parse(file_name)          element = dom_obj.documentElement     def  _f (subElementObj) :         for  anchor in  subElementObj:             xdr_from = anchor.getElementsByTagName('xdr:from' )[0 ]             col = xdr_from.childNodes[0 ].firstChild.data               row = xdr_from.childNodes[2 ].firstChild.data             embed = \             anchor.getElementsByTagName('xdr:pic' )[0 ].getElementsByTagName('xdr:blipFill' )[0 ].getElementsByTagName('a:blip' )[0 ].getAttribute('r:embed' )               image_info[(int(row), int(col))] = img_dict.get(int(embed.replace('rId' , '' )), {}).get(img_feature)     sub_twoCellAnchor = element.getElementsByTagName("xdr:twoCellAnchor" )     sub_oneCellAnchor = element.getElementsByTagName("xdr:oneCellAnchor" )     _f(sub_twoCellAnchor)     _f(sub_oneCellAnchor)     return  image_info 
 
5、读取包含图片的excel数据
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 def  read_excel_info (file_path, img_col_index, img_feature='img_path' ) :    img_info_dict = get_img_info(file_path, img_feature)     book = xlrd.open_workbook(file_path)     sheet = book.sheet_by_index(0 )     head = dict()     for  i, v in  enumerate(sheet.row(0 )):         head[i] = v.value     info_list = []     for  row_num in  range(sheet.nrows):         d = dict()         for  col_num in  range(sheet.ncols):             if  row_num == 0 :                 continue              if  'empty:'  in  sheet.cell(row_num, col_num).__str__():                 if  col_num in  img_col_index:                     d[head[col_num]] = img_info_dict.get((row_num, col_num))                 else :                     d[head[col_num]] = sheet.cell(row_num, col_num).value             else :                 d[head[col_num]] = sheet.cell(row_num, col_num).value         if  d:             info_list.append(d)     return  info_list 
 
1 2 3 4 5 6 7 8 9 10 11 import  reimport  xml.dom.minidom as  xmldomimport  osimport  zipfileimport  shutilimport  xlrdif  __name__ == '__main__' :  path = r'E:\\员工信息.xlsx'    data = read_excel_info(path, img_col_index=[4 ])   print(data) 
 
完整代码:https://github.com/dddreams/read-excel-image/blob/master/read_users.py 
4、处理图片对应不到的问题    有时会出现图片对应不到记录的情况,这是因为 Excel 中图片不在单独的单元格内,指定的列中取不到图片,这就需要调整 Excel 内的数据了,但是如果数据量比较大,调整起来就比较麻烦了。其实,我们可以借助 Excel 宏的功能,批量修改图片大小,批量让其居中,位于单元格内部,这样在解析 xml 时就能对应到每条记录了。
打开 Excel 按下 Alt + F11 ,然后点击【插入】菜单,选择【模块】复制下面代码:
1 2 3 4 5 6 7 Sub  dq()Dim  shp As ShapeFor  Each  shp In  ActiveSheet.Shapesshp.Left  = (shp.TopLeftCell.Width - shp.Width) / 2  + shp.TopLeftCell.Left  shp.Top = (shp.TopLeftCell.Height - shp.Height) / 2  + shp.TopLeftCell.Top Next End  Sub 
 
然后返回 Excel 【开始】菜单中点击【查找和选择】选择【定位条件】中【对象】,将选中所有的图片,然后按下Alt+F8 点击【执行】就可以批量将图片在所在单元格居中了。如果图片大小过大,全部选中后更改图片大小即可。然后解析 Xml 文件就不会出现图片与记录对应不到的情况了。
更多文章请关注微信公众号: 治恒说说