101 lines
3.0 KiB
Python
Executable File
101 lines
3.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
|
|
"""
|
|
@authors: Indico Development Team (Improved by Sam')
|
|
"""
|
|
|
|
|
|
import os
|
|
|
|
import numpy as np
|
|
|
|
|
|
def _read_idx_payload(path, header_size):
    """Return the raw uint8 payload of the file at *path*, minus its header.

    The first *header_size* bytes are skipped without being interpreted.
    """
    # Handing NumPy the path (rather than an already opened text-mode file
    # object) lets it open the file in binary mode and close it afterwards,
    # so no file handle is leaked.
    return np.fromfile(path, dtype=np.uint8)[header_size:]


def _load_split(data_dir, prefix, limit):
    """Load the first *limit* images and labels of one dataset split.

    *prefix* is the file-name prefix of the split ('train' or 't10k').
    Returns ``(images, labels)`` where images is a float array of shape
    (n, 784) scaled to [0, 1] and labels is a 1-D uint8 array.

    Raises ValueError if the image file is not a whole number of 28x28
    images, or if the image and label files disagree on the sample count.
    """
    nb_pixels = 28 * 28

    # Image files carry a 16-byte header, label files an 8-byte header.
    images = _read_idx_payload(
        os.path.join(data_dir, prefix + '-images-idx3-ubyte'), 16
    ).reshape((-1, nb_pixels))
    labels = _read_idx_payload(
        os.path.join(data_dir, prefix + '-labels-idx1-ubyte'), 8
    )

    if len(images) != len(labels):
        raise ValueError(
            "'%s' split is inconsistent: %d images but %d labels"
            % (prefix, len(images), len(labels))
        )

    # Truncate *before* converting to float so that only the requested
    # samples are expanded from 1 byte to 8 bytes per pixel, then bring the
    # pixel values into the [0, 1] range.
    images = images[:limit].astype(float) / 255.
    return images, labels[:limit]


def _labels_to_one_hot(labels, nbDigits):
    """Convert digit labels into a one-hot matrix of shape (n, nbDigits).

    For instance, if the first label is a '2', the second a '4' and the
    third a '9', the matrix (with nbDigits=10) becomes:

        |0 0 1 0 0 0 0 0 0 0|
        |0 0 0 0 1 0 0 0 0 0|
        |0 0 0 0 0 0 0 0 0 1|
    """
    # Fancy indexing below needs an array, so lists are converted first,
    # then everything is collapsed into a single dimension.
    labels = np.asarray(labels).flatten()

    one_hot = np.zeros((len(labels), nbDigits))
    # Set exactly one cell to 1 on each row: the column of the row's digit.
    one_hot[np.arange(len(labels)), labels] = 1
    return one_hot


def mnist(nbTrainings=60000, nbTests=10000, oneHotEncoding=True):
    """Load the MNIST training and test sets from ~/Documents/MNIST/.

    The shell script ``downloadMNIST.sh`` is run first (from the current
    working directory); it is expected to create the MNIST folder. Its exit
    status is not checked, so the load proceeds with whatever files exist.

    Parameters:
        nbTrainings: maximum number of training samples to return.
        nbTests: maximum number of test samples to return.
        oneHotEncoding: if True, labels are returned as one-hot float
            matrices with 10 columns (digits '0', ..., '9'); otherwise
            they are returned as 1-D uint8 arrays of digits.

    Returns:
        A tuple ``(trainingX, testX, trainingY, testY)``. The X arrays are
        float arrays of shape (n, 784) with values between 0 and 1.

    Raises:
        ValueError: if a data file does not hold whole 28x28 images or the
            image and label files of a split have different sample counts.
        OSError: if a data file cannot be opened.
    """
    os.system("sh downloadMNIST.sh")

    # Position where the MNIST folder is theoretically created. expanduser
    # resolves the home directory even when $HOME itself is not defined.
    data_dir = os.path.join(os.path.expanduser('~'), 'Documents', 'MNIST')

    trainingX, trainingY = _load_split(data_dir, 'train', nbTrainings)
    testX, testY = _load_split(data_dir, 't10k', nbTests)

    if oneHotEncoding:
        # 10 is for the digits '0', ..., '9'.
        trainingY = _labels_to_one_hot(trainingY, 10)
        testY = _labels_to_one_hot(testY, 10)

    return trainingX, testX, trainingY, testY
|