根据 GT(ground truth)标注,由带线表格生成无线表格

JSON 解析练习、Python 图像处理练习;表格包围框的毛刺问题尚待解决。

  1 # -*- coding: utf-8 -*-
  2 # coding: utf-8
  3 from PIL import Image, ImageDraw
  4 import cv2
  5 import os
  6 import csv
  7 import json
  8 color = {
  9     0: [255, 0, 0],
 10     1: [0, 255, 0],
 11     2: [0, 0, 255]
 12 }
 13 from PIL import Image
 14 import numpy as np
 15 
 16 # 取批量点取RGB众数,来推算背景色值
 17 def publicnum(num, d=0):
 18     dictnum = {}
 19     for i in range(len(num)):
 20         if str(num[i]) in dictnum.keys():
 21             dictnum[str(num[i])] += 1
 22         else:
 23             dictnum.setdefault(str(num[i]), 1)
 24     maxnum = 0
 25     maxkey = '[255 255 255]'
 26     for k, v in dictnum.items():
 27         if v >= maxnum:
 28             maxnum = v
 29             maxkey = k
 30     return maxkey
 31 
 32 page_img_dir = "JPG"
 33 output_dir = "年报_PDF_TABLE_JPG_eliminate_lines-5-15-final"
 34 
 35 isExists = os.path.exists(output_dir)
 36 if not isExists:
 37     os.makedirs(output_dir)
 38 csv_file = csv.reader(open('表格结构标注-带线年报_UTF-8.csv', 'r', encoding="gbk"))
 39 data_list = []
 40 for data in csv_file:
 41     data_list.append(data)
 42 
 43 print("page number: ", len(data_list) - 1)
 44 
 45 for data in data_list[1:]:  # 跳过第一行
 46     img_path = data[0]
 47     img_name = img_path.split('/')[-1]  # /分割后最后一个为名字
 48     pdf_name = img_name.split('_')[0]  # -分割后 第一个是名字
 49     local_img_path = os.path.join(page_img_dir, img_name)  # 拼接路径
 50     print(local_img_path)
 51     annotation = json.loads(data[2])  # json单元格读取
 52     objects = annotation['objects']  # object是一个列表,读取该列表
 53     cnt = 0
 54     tu = Image.open(local_img_path)
 55     page_img = np.array(tu)
 56     for page_object in objects:
 57         if 'cur' in page_object.keys():
 58             cur = page_object['cur']
 59         else:
 60             cur = cnt
 61         polygon = page_object['polygon']['ptList']
 62         x_list = [p['x'] for p in polygon]
 63         y_list = [p['y'] for p in polygon]
 64         x_min = min(x_list)
 65         x_max = max(x_list)
 66         y_min = min(y_list)
 67         y_max = max(y_list)
 68         if abs(x_max - x_min) < 20:  # 纵向线条
 69             xx = int((x_min + x_max) / 2)
 70             inline_y_list = [y_max+20, y_min]
 71             #寻找相交横线分割点
 72             for in_page_object in objects:
 73                 in_polygon = in_page_object['polygon']['ptList']
 74                 in_x_list = [in_p['x'] for in_p in in_polygon]
 75                 in_y_list = [in_p['y'] for in_p in in_polygon]
 76                 in_x_min = min(in_x_list)
 77                 in_x_max = max(in_x_list)
 78                 in_y_min = min(in_y_list)
 79                 in_y_max = max(in_y_list)
 80                 if in_y_max - in_y_min < 20:  # 判断为横线
 81                     if in_x_max+5 >= xx and in_x_min-5 <= xx:  # 判断相交
 82                         point_y = in_y_min
 83                         inline_y_list.append(point_y)
 84                         if 0<abs(y_max-point_y)< 10:
 85                             try:
 86                                 inline_y_list.remove(max(y_max+20, point_y))
 87                                 inline_y_list.append(min(y_max+20, point_y))
 88                             except:
 89                                 pass
 90                         elif 0 < abs(y_min-point_y) < 10:
 91                             try:
 92                                 inline_y_list.remove(min(y_min, point_y))
 93                                 inline_y_list.append(max(y_min, point_y))
 94                             except:
 95                                 pass
 96             inline_y_list = list({}.fromkeys(inline_y_list).keys())
 97             inline_y_list.sort()
 98             inline_y_list[-1]+=5
 99             if inline_y_list[-1]>2339:
100                 inline_y_list[-1]=2339
101             # 线条分割结束
102             for i in range(0, inline_y_list.__len__()):
103                 if i < inline_y_list.__len__() - 1:
104                     # 开始取样
105                     back_colors = []
106                     for yy in range(inline_y_list[i], inline_y_list[i + 1]):
107                         if xx + 8 < 1654:
108                             back_colors.append(page_img[yy, xx + 8])
109                         else:
110                             back_colors.append(page_img[yy, xx - 8])
111                     back_color = publicnum(back_colors)
112                     back_color = back_color[1:-1]
113                     try:
114                         back_color = back_color.split(' ')
115                     except:
116                         back_color = back_color.split('   ')
117                         print(type(back_color))
118                     if len(back_color) > 3:
119                         back_color = list(filter(None, back_color))
120                     # 取样结束
121                     # 纵向填色
122 
123                     for yy in range(inline_y_list[i]-4, inline_y_list[i + 1]-4):
124                         if y_min-20<inline_y_list[i]<y_max+20 :
125                             for ranging in range(-4, x_max-x_min+5):
126                                 if x_min+ranging < 1654 and x_min+ranging >= 0:
127                                         page_img[yy, x_min + ranging] = back_color
128                                 else:
129                                     pass
130                         else:
131                             pass
132         elif abs(y_max - y_min) < 20:  # 横向线条
133             yy = int((y_min + y_max) / 2)
134             inline_x_list = [x_max+20, x_min]
135             # 寻找相交横线分割点
136             for in_page_object2 in objects:
137                 polygon2 = in_page_object2['polygon']['ptList']
138                 in_x_list2 = [p['x'] for p in polygon2]
139                 in_y_list2 = [p['y'] for p in polygon2]
140                 in_x_min = min(in_x_list2)
141                 in_x_max = max(in_x_list2)
142                 in_y_min = min(in_y_list2)
143                 in_y_max = max(in_y_list2)
144                 if abs(in_x_max - in_x_min) < 20:  # 判断为纵线
145                     if in_y_max+5 >= y_min and in_y_min-5 <= y_max:  # 判断相交
146                         point_x = in_x_min
147                         inline_x_list.append(point_x)
148                         if 0<abs(x_max-point_x)<10:
149                             try:
150                                 inline_x_list.remove(max(x_max+20, point_x))
151                                 inline_x_list.append(min(x_max+20, point_x))
152                             except:
153                                 pass
154                         elif 0<abs(x_min-point_x)<10:
155                             try:
156                                 inline_x_list.remove(min(x_min, point_x))
157                                 inline_x_list.append(max(x_min, point_x))
158                             except:
159                                 pass
160                         else:
161                             pass
162             inline_x_list = list({}.fromkeys(inline_x_list).keys())
163             inline_x_list.sort()
164             #inline_x_list[-1]+=5
165             # 线条分割结束
166             for i in range(0, inline_x_list.__len__()):
167                 if i < inline_x_list.__len__() - 1:
168                     # 开始取样
169                     back_colors = []
170                     for xx in range(inline_x_list[i], inline_x_list[i + 1]):
171                         if yy+8 < 2339:
172                             back_colors.append(page_img[yy + 8, xx])
173                         else:
174                             back_colors.append(page_img[yy - 8, xx])
175                     back_color = publicnum(back_colors)
176                     back_color = back_color[1:-1]
177                     try:
178                         back_color = back_color.split(' ')
179                     except:
180                         back_color = back_color.split('   ')
181 
182                     if len(back_color) > 3:
183                         back_color = list(filter(None, back_color))
184                     # 取样结束
185                     # 横线填色
186                     for xx in range(inline_x_list[i]-4, inline_x_list[i + 1]-4):
187                         if x_min-20<inline_x_list[i]<x_max+20:
188                             for ranging in range(-4, y_max-y_min+5):
189                                 if y_min+ranging < 2339 and y_min+ranging >= 0:
190                                     page_img[y_min+ranging, xx] = back_color
191                                 elif y_min+ranging>=2339:
192                                     page_img[2338, xx] = back_color
193                                 else:
194                                     page_img[0, xx] = back_color
195                         else:
196                             pass
197         else:
198             print("no such line", 'x_min:', x_min,'x_max:', x_max, 'y_max:', y_max, 'y_min:', y_min)
199     tu = Image.fromarray(page_img.astype('uint8'))
200     output_path = os.path.join(output_dir, img_name.split('.')[0] + '_' + str(cur) + ".jpg")
201     tu.save(output_path)
202     cv2.imwrite(output_path, page_img)
203     cnt += 1
原文地址:https://www.cnblogs.com/wind-chaser/p/10868935.html