This repository has been archived on 2023-11-03. You can view files and clone it, but cannot push or open issues or pull requests.
MINDLE/dev_null/MNIST/loadMNIST.py

101 lines
3.0 KiB
Python
Executable File

#!/usr/bin/env python3
"""
@authors: Indico Development Team (Improved by Sam')
"""
import os
import numpy as np
def _read_idx_payload(path, header_size):
    """Return the bytes of the file at *path* as a uint8 array, minus its header.

    The path itself is handed to NumPy (instead of an already opened file
    object) so that the file is read in binary mode and closed again by
    NumPy: no file handle is left dangling.
    """
    return np.fromfile(path, dtype=np.uint8)[header_size:]


def _labels_to_one_hot(labels, nbDigits):
    """Convert integer digit labels into a one-hot encoded matrix.

    For instance, if the first label is a '2', the second a '4', and the
    third a '9', the returned matrix is:
        |0 0 1 0 0 0 0 0 0 0|
        |0 0 0 0 1 0 0 0 0 0|
        |0 0 0 0 0 0 0 0 0 1|
    """
    # Fancy indexing below needs an array (a plain list would not work),
    # collapsed into a single dimension
    labels = np.asarray(labels).flatten()
    # One row per label, one column per digit, everything set to zero...
    encoded = np.zeros((len(labels), nbDigits))
    # ... then exactly one '1' per row, in the column of the label
    encoded[np.arange(len(labels)), labels] = 1
    return encoded


def mnist(nbTrainings=60000, nbTests=10000, oneHotEncoding=True):
    """Load the MNIST images and labels stored in '~/Documents/MNIST'.

    The 'downloadMNIST.sh' script is run first (from the current working
    directory); its exit status is not checked.

    Args:
        nbTrainings: maximum number of training samples to return.
        nbTests: maximum number of test samples to return.
        oneHotEncoding: if true, labels are returned as one-hot rows of
            10 columns (digits '0', ..., '9'); otherwise as 1-D arrays of
            digits.

    Returns:
        The tuple (trainingX, testX, trainingY, testY). Images are float
        arrays of shape (samples, 784) with values between 0 and 1.

    Raises:
        ValueError: if an image file does not hold a whole number of
            28x28 images, or if images and labels counts differ.
    """
    os.system("sh downloadMNIST.sh")
    # Position where the MNIST folder is theoretically created.
    # 'expanduser' still works when the HOME variable is not defined.
    data_dir = os.path.join(os.path.expanduser('~'), 'Documents', 'MNIST')
    image_size = 28 * 28

    # IDX headers are skipped: 16 bytes for image files, 8 for label files.
    # The number of samples is inferred from the size of the files.
    train_pixels = _read_idx_payload(
        os.path.join(data_dir, 'train-images-idx3-ubyte'), 16
    ).reshape((-1, image_size))
    train_labels = _read_idx_payload(
        os.path.join(data_dir, 'train-labels-idx1-ubyte'), 8
    )
    test_pixels = _read_idx_payload(
        os.path.join(data_dir, 't10k-images-idx3-ubyte'), 16
    ).reshape((-1, image_size))
    test_labels = _read_idx_payload(
        os.path.join(data_dir, 't10k-labels-idx1-ubyte'), 8
    )

    # Every image must be bound to exactly one label
    for split, pixels, labels in (
        ('training', train_pixels, train_labels),
        ('test', test_pixels, test_labels),
    ):
        if len(pixels) != len(labels):
            raise ValueError(
                'MNIST %s set is inconsistent: %d images but %d labels'
                % (split, len(pixels), len(labels))
            )

    # Keep only the requested quantity BEFORE converting to float, so that
    # no memory is spent on samples which are dropped anyway.
    # Dividing by 255 gives values between 0 and 1.
    trainingX = train_pixels[:nbTrainings].astype(float) / 255.
    testX = test_pixels[:nbTests].astype(float) / 255.
    trainingY = train_labels[:nbTrainings]
    testY = test_labels[:nbTests]

    if oneHotEncoding:
        # 10 is for the digits '0', ..., '9'
        trainingY = _labels_to_one_hot(trainingY, 10)
        testY = _labels_to_one_hot(testY, 10)
    else:
        trainingY = np.asarray(trainingY)
        testY = np.asarray(testY)
    return trainingX, testX, trainingY, testY