Initial commit

This commit is contained in:
lichuang
2019-05-06 17:43:25 +08:00
parent 2b1bd47f91
commit 212d0c6dcb
30 changed files with 319144 additions and 2 deletions

104
dataset/dataset_paris.py Normal file
View File

@@ -0,0 +1,104 @@
"""
@file: dataset_paris.py
@time: 2018/7/31 15:03
@desc:Create the input data pipeline using `tf.data`
"""
import numpy as np
import tensorflow as tf
image_width = None
image_height = None
images_dir = None
channels = 1
def _read_image(filename, is_augment):
    """Load a PNG from `images_dir`, optionally flip it, and resize it.

    `is_augment` is an int32 tensor; the image is flipped left-right
    when it equals 1, otherwise it is passed through unchanged.
    """
    raw = tf.read_file(tf.string_join([images_dir, filename]))
    decoded = tf.image.decode_png(raw, channels=channels)
    flip_marker = tf.constant(1, dtype=tf.int32, name="true_constant")
    decoded = tf.cond(tf.equal(flip_marker, is_augment),
                      lambda: tf.image.flip_left_right(decoded),
                      lambda: decoded)
    return tf.image.resize_images(decoded, [image_width, image_height])
def _parse_function(item):
    """Map a (file0, file1, label, is_augment) string record to (image, label).

    Both signature images are loaded with the same augmentation flag and
    stacked along the channel axis (axis 2).
    """
    augment_flag = tf.string_to_number(item[3], out_type=tf.int32)
    first = _read_image(item[0], augment_flag)
    second = _read_image(item[1], augment_flag)
    stacked = tf.concat([first, second], 2)
    return stacked, tf.string_to_number(item[2])
def _input_fn(params, is_training, is_augment=False, pos_repeating=1, only_label=None):
    """Build a `tf.data.Dataset` of signature image pairs.

    Args:
        params: hyperparameter object (list-file paths, image size, batch
            size, positive/negative pair counts per signer, ...).
        is_training: read `params.signature_train_list` when True, the
            validation list otherwise.
        is_augment: when True (and training), add a horizontally-flipped
            copy of every kept pair.
        pos_repeating: expansion factor for positive pairs; negative pairs
            are randomly subsampled to balance the classes.
        only_label: when set, keep only pairs with this label.

    Returns:
        A shuffled, repeating, batched `tf.data.Dataset` yielding
        (image_pair, label) tuples.
    """
    listfile_path = params.signature_train_list if is_training else params.signature_val_list
    data = []
    shuffle_neg = []
    size_per_signer = params.positive_size + params.negative_size
    with open(listfile_path) as file:
        for i, line in enumerate(file):
            items = line.split(' ')
            file0 = items[0]
            file1 = items[1]
            label = int(items[2])
            # Skip pairs that don't match the requested label; label 2
            # (random-forgery pairs) is always excluded here, so no later
            # branch needs to handle it.
            if (only_label is not None and label != only_label) or label == 2:
                continue
            repeating = 1
            if is_training and pos_repeating > 0 and i % size_per_signer == 0:
                # The number of positive/negative pairs per signer is
                # positive_size/negative_size (e.g. 276/996), so positives
                # are expanded and negatives subsampled: draw a fresh random
                # subset of negative slots at the start of each signer.
                shuffle_neg = np.arange(params.positive_size, size_per_signer)
                np.random.shuffle(shuffle_neg)
                shuffle_neg = shuffle_neg[:params.positive_size * pos_repeating]
            if is_training and pos_repeating > 0:
                if label == 0:
                    # Keep a negative pair only if its slot was drawn above.
                    repeating = 1 if i % size_per_signer in shuffle_neg else 0
                elif label == 1:
                    # Expand positive pairs.
                    repeating = pos_repeating
            for _ in range(repeating):
                # Record layout: (file0, file1, label, is_augment_flag).
                data.append((file0, file1, label, 0))
                if is_augment and is_training:
                    data.append((file0, file1, label, 1))
    np.random.shuffle(data)
    print("examples of data: -> %d" % len(data))
    dataset = tf.data.Dataset.from_tensor_slices(np.array(data))
    dataset = dataset.map(_parse_function, num_parallel_calls=params.num_parallel_calls)
    dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(params.num_epochs)
    dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(
        params.batch_size * params.num_gpus))
    dataset = dataset.prefetch(10)
    return dataset
def input_fn(params, is_training, repeating=1, is_augment=False, only_label=None):
    """Configure the module-level image settings from `params`, then build
    and return the dataset via `_input_fn`."""
    global image_width, image_height, images_dir, channels
    image_width, image_height = params.image_width, params.image_height
    images_dir, channels = params.images_dir, params.channels
    return _input_fn(params, is_training,
                     is_augment=is_augment,
                     pos_repeating=repeating,
                     only_label=only_label)

View File

@@ -0,0 +1,111 @@
"""
@file: dataset_bhsig260.py
@time: 2018/6/20 15:03
@desc:Create the paris list of BHSig260 Database
"""
import copy
import os
import sys
import imageio
import numpy as np
num_genuine = 24
num_forged = 30
def combine(l, k):
    """Return all length-k combinations of the elements of `l`.

    Each combination is a list, and combinations appear in lexicographic
    index order — exactly what the original hand-rolled recursion produced,
    so this delegates to itertools.combinations instead of re-implementing it.
    """
    from itertools import combinations
    return [list(combo) for combo in combinations(l, k)]
def combine_2list(list1, list2):
    """Return all ordered pairs [a, b] with a from `list1` and b from `list2`.

    This is the Cartesian product with `list1` as the outer loop — the same
    order the original nested loops produced — so itertools.product is used
    instead of hand-rolling it.
    """
    from itertools import product
    return [list(pair) for pair in product(list1, list2)]
def generate_list(data_dir, train_size, filename_pre, listfile_name):
    """Write train/val pair-list files for one BHSig260 language directory.

    Args:
        data_dir: directory whose subdirectories are one signer each.
        train_size: number of signers randomly assigned to the train split;
            the remaining signers go to the val split.
        filename_pre: filename prefix of the image files (e.g. 'H-S').
        listfile_name: output path prefix; '<prefix>_train.txt' and
            '<prefix>_val.txt' are written.

    Each output line is '<file0> <file1> <label>' with label 1 for a
    genuine-genuine pair and label 0 for a genuine-forged pair.
    """
    root_dir = os.path.basename(data_dir)
    signers_list = os.listdir(data_dir)
    train_indexes = np.arange(len(signers_list))
    np.random.shuffle(train_indexes)
    # Use a set for O(1) split-membership tests in the signer loop.
    train_indexes = set(train_indexes[:train_size])
    # The index pairs are identical for every signer, so build them once.
    genuine_genuine_suf = combine(list(range(1, num_genuine + 1)), 2)
    genuine_forged_suf = combine_2list(list(range(1, num_genuine + 1)),
                                       list(range(1, num_forged + 1)))
    # Context managers guarantee both list files are closed on any exit path.
    with open(listfile_name + '_train.txt', 'w') as list_file_train, \
            open(listfile_name + '_val.txt', 'w') as list_file_test:
        for i, signer in enumerate(signers_list):
            list_file = list_file_train if i in train_indexes else list_file_test
            for item in genuine_genuine_suf:
                genuine0 = "%s/%s/%s-%d-G-%02d%s" % (root_dir, signer, filename_pre, int(signer), item[0], '.jpg')
                genuine1 = "%s/%s/%s-%d-G-%02d%s" % (root_dir, signer, filename_pre, int(signer), item[1], '.jpg')
                list_file.write(genuine0 + ' ' + genuine1 + ' 1\n')
            for item in genuine_forged_suf:
                genuine = "%s/%s/%s-%d-G-%02d%s" % (root_dir, signer, filename_pre, int(signer), item[0], '.jpg')
                forged = "%s/%s/%s-%d-F-%02d%s" % (root_dir, signer, filename_pre, int(signer), item[1], '.jpg')
                list_file.write(genuine + ' ' + forged + ' 0\n')
def rename(dir_path):
    """Strip zero padding from the '-S-NNN' index of every .jpg under dir_path."""
    for folder, _subdirs, filenames in os.walk(dir_path):
        jpgs = (name for name in filenames if name.endswith('.jpg'))
        for old_name in jpgs:
            fixed = old_name.replace('-S-00', '-S-').replace('-S-0', '-S-')
            os.rename(os.path.join(folder, old_name),
                      os.path.join(folder, fixed))
def tif_to_jpg(tif_dir, jpg_dir):
    """Mirror the directory tree under tif_dir into jpg_dir, converting
    every .tif image to .jpg.

    Non-.tif files are skipped. Output directories are created as needed;
    makedirs (vs the original mkdir) also creates missing parents, so the
    function works even when jpg_dir itself does not exist yet and is
    idempotent across re-runs.
    """
    for root, dirs, files in os.walk(tif_dir):
        to_dir = root.replace(tif_dir, jpg_dir)
        os.makedirs(to_dir, exist_ok=True)
        for file in files:
            if not file.endswith('.tif'):
                continue
            image = imageio.imread(os.path.join(root, file))
            jpg_file = file.replace('.tif', '.jpg')
            imageio.imwrite(os.path.join(to_dir, jpg_file), image)
def main(argv=None):
    """Entry point: strip zero padding from the BHSig260 jpg filenames."""
    argv = sys.argv if argv is None else argv
    rename('/home/deeplearning/work/Deeplearning/dataset/writingID/offline/BHSig260_jpgs/')
    # Hindi/Bengali list generation, kept for reference:
    # generate_list('/home/deeplearning/work/Deeplearning/dataset/writingID/offline/BHSig260_jpgs/Hindi', 100, 'H-S',
    #               '../experiments/data_list/bhsig260_Hindi')
    # generate_list('/home/deeplearning/work/Deeplearning/dataset/writingID/offline/BHSig260_jpgs/Bengali', 50,
    #               'B-S',
    #               '../experiments/data_list/bhsig260_Bengali')


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,96 @@
"""
@file: generate_list_cedar.py
@time: 2018/6/20 15:03
@desc:Create the paris list of CEDAR Database
"""
import copy
import os
import sys
import imageio
import numpy as np
num_genuine = 24
num_forged = 24
def combine(l, k):
    """Return all length-k combinations of the elements of `l`.

    Each combination is a list, and combinations appear in lexicographic
    index order — exactly what the original hand-rolled recursion produced,
    so this delegates to itertools.combinations instead of re-implementing it.
    """
    from itertools import combinations
    return [list(combo) for combo in combinations(l, k)]
def combine_2list(list1, list2):
    """Return all ordered pairs [a, b] with a from `list1` and b from `list2`.

    This is the Cartesian product with `list1` as the outer loop — the same
    order the original nested loops produced — so itertools.product is used
    instead of hand-rolling it.
    """
    from itertools import product
    return [list(pair) for pair in product(list1, list2)]
def generate_list(train_size, listfile_name):
    """Write train/val pair-list files for the CEDAR database (signers 1..55).

    Args:
        train_size: number of signers randomly assigned to the train split;
            the remaining signers go to the val split.
        listfile_name: output path prefix; '<prefix>_train.txt' and
            '<prefix>_val.txt' are written.

    Each output line is '<file0> <file1> <label>' with label 1 for a
    genuine-genuine pair and label 0 for a genuine-forged pair.
    """
    signers_list = list(range(1, 56))
    train_indexes = np.arange(len(signers_list))
    np.random.shuffle(train_indexes)
    # Use a set for O(1) split-membership tests in the signer loop.
    train_indexes = set(train_indexes[:train_size])
    # The index pairs are identical for every signer, so build them once.
    genuine_genuine_suf = combine(list(range(1, num_genuine + 1)), 2)
    genuine_forged_suf = combine_2list(list(range(1, num_genuine + 1)),
                                       list(range(1, num_forged + 1)))
    # Context managers guarantee both list files are closed on any exit path.
    with open(listfile_name + '_train.txt', 'w') as list_file_train, \
            open(listfile_name + '_val.txt', 'w') as list_file_test:
        for i, signer in enumerate(signers_list):
            list_file = list_file_train if i in train_indexes else list_file_test
            for item in genuine_genuine_suf:
                genuine0 = "%s%d_%d%s" % ('full_org/original_', int(signer), item[0], '.png')
                genuine1 = "%s%d_%d%s" % ('full_org/original_', int(signer), item[1], '.png')
                list_file.write(genuine0 + ' ' + genuine1 + ' 1\n')
            for item in genuine_forged_suf:
                genuine = "%s%d_%d%s" % ('full_org/original_', int(signer), item[0], '.png')
                forged = "%s%d_%d%s" % ('full_forg/forgeries_', int(signer), item[1], '.png')
                list_file.write(genuine + ' ' + forged + ' 0\n')
def tif_to_jpg(tif_dir, jpg_dir):
    """Mirror the directory tree under tif_dir into jpg_dir, converting
    every .tif image to .jpg.

    Non-.tif files are skipped. Output directories are created as needed;
    makedirs (vs the original mkdir) also creates missing parents, so the
    function works even when jpg_dir itself does not exist yet and is
    idempotent across re-runs.
    """
    for root, dirs, files in os.walk(tif_dir):
        to_dir = root.replace(tif_dir, jpg_dir)
        os.makedirs(to_dir, exist_ok=True)
        for file in files:
            if not file.endswith('.tif'):
                continue
            image = imageio.imread(os.path.join(root, file))
            jpg_file = file.replace('.tif', '.jpg')
            imageio.imwrite(os.path.join(to_dir, jpg_file), image)
def main(argv=None):
    """Entry point: build the CEDAR train/val pair lists (50 train signers)."""
    argv = sys.argv if argv is None else argv
    generate_list(50, '../experiments/data_list/cedar')
    # Alternative invocation kept for reference:
    # generate_list(100, '../experiments/data_list/bhsig260_Hindi')


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,97 @@
"""
@file: model.py
@time: 2018/4/17 15:03
@desc: Generate the list of data pairs
"""
import copy
import os
import sys
import numpy as np
image_dir = '/home/deeplearning/work/Deeplearning/dataset/writingID/offline/firmas/'
list_filename_train = '../experiments/data_list/firmas_pairs_c_train.txt'
list_filename_test = '../experiments/data_list/firmas_pairs_c_val.txt'
num_genuine = 24
num_forged = 30
def combine(l, k):
    """Return all length-k combinations of the elements of `l`.

    Each combination is a list, and combinations appear in lexicographic
    index order — exactly what the original hand-rolled recursion produced,
    so this delegates to itertools.combinations instead of re-implementing it.
    """
    from itertools import combinations
    return [list(combo) for combo in combinations(l, k)]
def combine_2list(list1, list2):
    """Return all ordered pairs [a, b] with a from `list1` and b from `list2`.

    This is the Cartesian product with `list1` as the outer loop — the same
    order the original nested loops produced — so itertools.product is used
    instead of hand-rolling it. Works on any iterables, including the numpy
    arrays main() passes in.
    """
    from itertools import product
    return [list(pair) for pair in product(list1, list2)]
def main(argv=None):
    """Generate the firmas pair-list files.

    For every signer directory under `image_dir`, writes genuine-genuine
    pairs (label 1) and genuine-forged pairs (label 0); then writes up to
    `random_forged_nums` cross-writer "random forgery" pairs (label 2),
    with roughly every 6th pair going to the validation list.
    """
    if argv is None:
        argv = sys.argv
    signers_list = os.listdir(image_dir)
    # The index pairs are identical for every signer, so build them once.
    genuine_genuine_suf = combine(list(range(1, num_genuine + 1)), 2)
    genuine_forged_suf = combine_2list(list(range(1, num_genuine + 1)),
                                       list(range(1, num_forged + 1)))
    # Context managers guarantee both list files are closed on any exit path.
    with open(list_filename_train, 'w') as list_file_train, \
            open(list_filename_test, 'w') as list_file_test:
        for signer in signers_list:
            # Signers numbered <= 3500 go to the train split.
            list_file = list_file_train if int(signer) <= 3500 else list_file_test
            for item in genuine_genuine_suf:
                genuine0 = signer + '/c-' + signer + "-%02d" % (item[0]) + '.jpg'
                genuine1 = signer + '/c-' + signer + "-%02d" % (item[1]) + '.jpg'
                list_file.write(genuine0 + ' ' + genuine1 + ' 1\n')
            for item in genuine_forged_suf:
                genuine = signer + '/c-' + signer + "-%02d" % (item[0]) + '.jpg'
                forged = signer + '/cf-' + signer + "-%02d" % (item[1]) + '.jpg'
                list_file.write(genuine + ' ' + forged + ' 0\n')
        # Random-forgery pairs: every writer of the first half combined with
        # every writer of the second half, shuffled, capped at
        # random_forged_nums. (Slicing fixes the original off-by-one that
        # emitted random_forged_nums + 1 pairs.)
        random_forged_nums = 2880000
        writers = np.split(np.arange(1, 4001, 1), 2)
        random_pairs = combine_2list(writers[0], writers[1])
        np.random.shuffle(random_pairs)
        for i, item in enumerate(random_pairs[:random_forged_nums], start=1):
            # Every 6th pair goes to the validation list (~1/6 val split).
            list_file = list_file_train if i % 6 != 0 else list_file_test
            genuine = '%03d' % item[0] + '/c-' + '%03d' % item[0] + "-09" + '.jpg'
            forged = '%03d' % item[1] + '/c-' + '%03d' % item[1] + "-09" + '.jpg'
            list_file.write(genuine + ' ' + forged + ' 2\n')


if __name__ == "__main__":
    sys.exit(main())

27
dataset/params.json Normal file
View File

@@ -0,0 +1,27 @@
{
"model": "Inception_2logits",
"signature_train_list": "./experiments/data_list/firmas_pairs_c_train.txt",
"signature_val_list": "./experiments/data_list/firmas_pairs_c_val.txt",
"images_dir": "/home/deeplearning/work/Deeplearning/dataset/writingID/offline/firmas_binarized/",
"is_augment": false,
"learning_rate": 1e-5,
"batch_size": 32,
"num_epochs": 1,
"use_batch_norm": true,
"bn_momentum": 0.9,
"margin": 5,
"embedding_size": 64,
"keep_prob": 0.4,
"squared": false,
"image_width": 220,
"image_height": 155,
"positive_size": 276,
"negative_size": 720,
"channels": 1,
"num_parallel_calls": 4,
"save_summary_steps": 100,
"save_checkpoints_steps": 1000,
"num_gpus": 3,
"keep_checkpoint_max": 25,
"eval_steps": 10
}

View File

@@ -0,0 +1,45 @@
# encoding: utf-8
"""
@author: lichuang
@license: (C) Copyright 2010, CFCA
@file: preprosess_images.py
@time: 2018/5/8 18:
@desc: Regularize images: binarize them and turn the background black
"""
import os
import sys
import imageio
import numpy as np

# Source image tree and destination for the binarized copies.
dir_to_process = '/home/deeplearning/work/Deeplearning/dataset/writingID/offline/firmas/'
dir_processed = '/home/deeplearning/work/Deeplearning/dataset/writingID/offline/firmas_binarized/'
def _normalize_images(images_dir, processed_dir, reverse):
"""binaries, turn into black background """
for root, dirs, files in os.walk(images_dir):
for name in files:
new_path = os.path.join(processed_dir, os.path.split(root)[-1])
if not os.path.exists(new_path):
os.mkdir(new_path)
if name.lower().endswith('.jpg'):
image = imageio.imread(os.path.join(root, name))
image[np.where(image < 230)] = 0
image[np.where(image >= 230)] = 255
if reverse:
image = 255 - image
imageio.imwrite(os.path.join(new_path, name), image)
print('all images processed!')
def main(argv=None):
    """Entry point: binarize the firmas image tree into dir_processed."""
    argv = sys.argv if argv is None else argv
    _normalize_images(dir_to_process, dir_processed, False)


if __name__ == "__main__":
    sys.exit(main())