101 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			101 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env python3
 | 
						|
 | 
						|
 | 
						|
"""
 | 
						|
@authors: Indico Development Team (Improved by Sam')
 | 
						|
"""
 | 
						|
 | 
						|
 | 
						|
import os
 | 
						|
 | 
						|
import numpy as np
 | 
						|
 | 
						|
 | 
						|
# Number of leading header bytes skipped in each kind of MNIST IDX file
_IMAGE_HEADER_BYTES = 16
_LABEL_HEADER_BYTES = 8

# Each image is stored as 28 x 28 unsigned bytes
_PIXELS_PER_IMAGE = 28 * 28


def _read_idx_payload(path, header_size):
    """Return the bytes of the file at *path* as uint8, minus its header."""
    # Handing the path (not an open file object) to NumPy lets it open the
    # file in binary mode and close it again, so no file handle is leaked.
    return np.fromfile(path, dtype=np.uint8)[header_size:]


def _labels_to_one_hot(labels, nbDigits):
    """Convert digit labels into a (len(labels), nbDigits) one-hot matrix.

    For instance, with labels 2, 4 and 9 and nbDigits=10, the result is:

        |0 0 1 0 0 0 0 0 0 0|
        |0 0 0 0 1 0 0 0 0 0|
        |0 0 0 0 0 0 0 0 0 1|
    """
    # Fancy indexing below needs an array, so accept any sequence of labels
    # and collapse it into a single dimension.
    labels = np.asarray(labels).ravel()

    oneHot = np.zeros((len(labels), nbDigits))

    # Set exactly one entry to 1 on each row: the column of that row's label
    oneHot[np.arange(len(labels)), labels] = 1

    return oneHot


def mnist(nbTrainings=60000, nbTests=10000, oneHotEncoding=True):
    """Load the MNIST data set from '~/Documents/MNIST'.

    The shell script 'downloadMNIST.sh' is run first (from the current
    working directory); its exit status is ignored, so a failed download
    only shows up when the data files are read.

    Args:
        nbTrainings: maximum number of training samples to return.
        nbTests: maximum number of test samples to return.
        oneHotEncoding: if True, labels are returned as one-hot matrices
            with 10 columns; otherwise as 1-D arrays of digits.

    Returns:
        A tuple (trainingX, testX, trainingY, testY). Images are float
        arrays of shape (count, 784) scaled to [0, 1].

    Raises:
        ValueError: if an image file does not hold a whole number of images,
            or if an image file and its label file disagree on the number
            of samples.
        OSError: if one of the data files cannot be read.
    """
    # Best-effort download: the return code is deliberately not checked
    os.system("sh downloadMNIST.sh")

    # Position where the MNIST folder is theoretically created
    data_dir = os.path.join(os.path.expanduser('~'), 'Documents', 'MNIST')

    def loadImages(fileName):
        raw = _read_idx_payload(
            os.path.join(data_dir, fileName), _IMAGE_HEADER_BYTES
        )
        # The sample count is inferred from the file size; values are scaled
        # from 0..255 down to 0..1
        return raw.reshape((-1, _PIXELS_PER_IMAGE)).astype(float) / 255.

    def loadLabels(fileName):
        return _read_idx_payload(
            os.path.join(data_dir, fileName), _LABEL_HEADER_BYTES
        ).reshape(-1)

    trainingX = loadImages('train-images-idx3-ubyte')
    trainingY = loadLabels('train-labels-idx1-ubyte')
    testX = loadImages('t10k-images-idx3-ubyte')
    testY = loadLabels('t10k-labels-idx1-ubyte')

    # Every image must have its label, otherwise the files are inconsistent
    for subset, images, labels in (
        ('training', trainingX, trainingY),
        ('test', testX, testY),
    ):
        if len(images) != len(labels):
            raise ValueError(
                'MNIST %s set is inconsistent: %d images but %d labels'
                % (subset, len(images), len(labels))
            )

    # If the requested quantities are lower than what the files hold,
    # keep only the first samples
    trainingX = trainingX[:nbTrainings]
    trainingY = trainingY[:nbTrainings]

    testX = testX[:nbTests]
    testY = testY[:nbTests]

    if oneHotEncoding:
        # 10 is for the digits '0', ..., '9'
        trainingY = _labels_to_one_hot(trainingY, 10)
        testY = _labels_to_one_hot(testY, 10)

    return trainingX, testX, trainingY, testY
 |