update code and readme

This commit is contained in:
JiageWang
2019-08-24 21:26:06 +08:00
parent bff40d9966
commit 90ef0ab210
5 changed files with 238 additions and 144 deletions

View File

@@ -1,66 +1,75 @@
import os
import numpy as np
import struct
import pickle
import threading
import numpy as np
from PIL import Image
# Root directory that holds the dataset folders used by this script.
data_dir = './data'
# Alternative training-set path (disabled):
# train_data_dir = "../data/HWDB1.1trn_gnt"
# Directories containing the training and test ``.gnt`` files.
train_data_dir = os.path.join(data_dir, 'HWDB1.1trn_gnt')
test_data_dir = os.path.join(data_dir, 'HWDB1.1tst_gnt')
def read_from_gnt_dir(gnt_dir):
    """Yield ``(image, label)`` pairs from every ``.gnt`` file in *gnt_dir*.

    Each record in a file starts with a 10-byte header:

    * bytes 0-3: total record size (little-endian), header included
    * bytes 4-5: label code, byte 4 being the high byte
    * bytes 6-7: image width (little-endian)
    * bytes 8-9: image height (little-endian)

    followed by ``width * height`` bytes of pixel data.

    Args:
        gnt_dir: Directory to scan; only names ending in ``.gnt`` are read.
            Files are visited in sorted name order so the output order is
            deterministic.

    Yields:
        Tuples ``(image, label)`` where ``image`` is a ``uint8`` array of
        shape ``(height, width)`` and ``label`` is a plain ``int``.

    Reading of a file stops at the first record that is truncated or whose
    declared size does not match ``10 + width * height``.
    """
    header_size = 10

    # Process a single gnt file to get its images and labels.
    def one_file(f):
        while True:
            header = np.fromfile(f, dtype='uint8', count=header_size)
            if header.size < header_size:
                # End of file, or a truncated trailing header.
                break
            # Decode from raw bytes: arithmetic on uint8 elements can wrap
            # around at 8 bits and silently corrupt sizes and labels.
            raw = header.tobytes()
            sample_size = int.from_bytes(raw[0:4], 'little')
            label = int.from_bytes(raw[4:6], 'big')
            width = int.from_bytes(raw[6:8], 'little')
            height = int.from_bytes(raw[8:10], 'little')
            if header_size + width * height != sample_size:
                break
            pixels = np.fromfile(f, dtype='uint8', count=width * height)
            if pixels.size != width * height:
                # Truncated pixel data; reshape would raise otherwise.
                break
            yield pixels.reshape((height, width)), label

    for file_name in sorted(os.listdir(gnt_dir)):
        if file_name.endswith('.gnt'):
            file_path = os.path.join(gnt_dir, file_name)
            with open(file_path, 'rb') as f:
                yield from one_file(f)
def gnt_to_img(gnt_dir, img_dir):
    """Convert every sample under *gnt_dir* into a PNG file under *img_dir*.

    Each sample is saved as ``<img_dir>/<class index>/<counter>.png``, where
    the class index is looked up in the module-level ``char_dict`` and
    zero-padded to five digits, and ``counter`` is the running sample number
    for this call (starting at 0).

    Args:
        gnt_dir: Directory with ``.gnt`` files, passed to
            ``read_from_gnt_dir``.
        img_dir: Output root directory; class sub-directories are created
            on demand.

    Raises:
        KeyError: If a decoded character is missing from ``char_dict``.
    """
    for counter, (image, label) in enumerate(read_from_gnt_dir(gnt_dir=gnt_dir)):
        # The label is a two-byte GB2312 code; decode it to the character
        # that keys ``char_dict``.
        char = struct.pack('>H', label).decode('gb2312')
        dir_name = os.path.join(img_dir, '%0.5d' % char_dict[char])
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(dir_name, exist_ok=True)
        img = Image.fromarray(image)
        img.convert('RGB').save(os.path.join(dir_name, str(counter) + '.png'))
        # NOTE: this progress label is printed for whichever dataset is
        # being converted, not only the training set.
        print("train_counter=", counter)
# Paths
data_dir = './data'
train_gnt_dir = os.path.join(data_dir, 'HWDB1.1trn_gnt')
test_gnt_dir = os.path.join(data_dir, 'HWDB1.1tst_gnt')
train_img_dir = os.path.join(data_dir, 'train')
test_img_dir = os.path.join(data_dir, 'test')
os.makedirs(train_img_dir, exist_ok=True)
os.makedirs(test_img_dir, exist_ok=True)

# Collect the character set from the labels of the test data.
char_set = set()
for _, tagcode in read_from_gnt_dir(gnt_dir=test_gnt_dir):
    tagcode_unicode = struct.pack('>H', tagcode).decode('gb2312')
    char_set.add(tagcode_unicode)
char_list = list(char_set)
# Map each character to its index in sorted order.
char_dict = dict(zip(sorted(char_list), range(len(char_list))))
print(len(char_dict))
print("char_dict=", char_dict)

# Persist the mapping so it can be reloaded later.
with open('char_dict', 'wb') as f:
    pickle.dump(char_dict, f)

# Convert both datasets concurrently. Thread.start() returns None, so the
# Thread objects must be bound before starting; otherwise join() is called
# on None and raises AttributeError.
train_thread = threading.Thread(target=gnt_to_img, args=(train_gnt_dir, train_img_dir))
test_thread = threading.Thread(target=gnt_to_img, args=(test_gnt_dir, test_img_dir))
train_thread.start()
test_thread.start()
train_thread.join()
test_thread.join()