"""Download the Isomap faces data set and export it as text tables,
per-face PNG images and a PCA-whitened table."""
from scipy.io import loadmat
from scipy.misc import imsave
from sklearn.decomposition import PCA

import hashlib
import logging
import numpy as np
import os
import os.path
import sklearn.decomposition
import subprocess
import wget


# Remote location of the compressed faces data set, together with the
# SHA-256 hex digest used to validate the downloaded archive.
DATA_URL = "http://isomap.stanford.edu/face_data.mat.Z"
SHA256_DIGEST = (
    "9c5bc75f204071bbd340aa3ff584757ec784b0630206e526d4cd3809f2650a8a"
)

# Name of the uncompressed data file on local disk.
DATA_FNAME = "face_data.mat"

# Outputs: one directory for image files and four text tables.
IMG_DIR = "images"
IMG_FNAME = "face_raw.tbl"
LIGHTS_FNAME = "face_lights.tbl"
POSES_FNAME = "face_poses.tbl"
PCA_FNAME = "faces.tbl"


if __name__ == "__main__":
    # Script entry point.  Steps, in order:
    #   1. make sure DATA_FNAME exists (download, verify, uncompress);
    #   2. dump the raw image matrix, the lights and the poses as text;
    #   3. write every face as an individual PNG under IMG_DIR;
    #   4. write a PCA-whitened version of the image matrix.
    # Progress and errors go to faces_extract.log; any fatal problem ends
    # the process with exit status 1.
    logging.basicConfig(filename="faces_extract.log",
                        format="%(levelname)s:%(message)s",
                        level=logging.INFO)

    # Get original data
    if not os.path.exists(DATA_FNAME):
        archive_fname = "{}.Z".format(DATA_FNAME)

        # Reuse a previously downloaded archive if one is lying around.
        if not os.path.exists(archive_fname):
            logging.info("Downloading faces data from '{}'".format(DATA_URL))
            wget.download(DATA_URL, archive_fname)

        logging.info("Checking SHA-256 digest")
        digest = hashlib.sha256()
        with open(archive_fname, "rb") as f:
            # Hash in chunks so the whole archive is never held in memory.
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        if digest.hexdigest() != SHA256_DIGEST:
            logging.error("File seems corrupted; aborting")
            raise SystemExit(1)

        logging.info("Uncompressing data into '{}'".format(DATA_FNAME))
        try:
            status = subprocess.call(["uncompress", archive_fname])
        except OSError as err:
            # Raised when the 'uncompress' program cannot be executed.
            logging.error(
                "Could not run 'uncompress' ({}); aborting".format(err))
            raise SystemExit(1)
        if status != 0:
            # Stop here rather than failing later with an unrelated-looking
            # error when the data file turns out to be missing.
            logging.error(
                "'uncompress' exited with status {}; aborting".format(status))
            raise SystemExit(1)

    # We have the original data; proceed
    logging.info("Loading faces data")
    faces = loadmat(DATA_FNAME)

    # One face per column; the tables below are written one face per row.
    face_images = faces["images"]
    logging.info("Writing image table data to {}".format(IMG_FNAME))
    np.savetxt(IMG_FNAME, face_images.T, fmt="%f")

    if not os.path.exists(IMG_DIR):
        logging.info("Creating directory {}".format(IMG_DIR))
        os.makedirs(IMG_DIR, 0o755)
    elif not os.path.isdir(IMG_DIR):
        logging.error("File {} exists; aborting".format(IMG_DIR))
        raise SystemExit(1)

    # NOTE(review): scipy.misc.imsave is absent from recent SciPy releases;
    # confirm the SciPy version this script is expected to run against.
    logging.info("Writing image files to {}".format(IMG_DIR))
    for i, column in enumerate(face_images.T):
        # Each column is reshaped to 64x64 and transposed before saving, so
        # the data is assumed to hold 4096 values per face.
        image = column.reshape(64, 64).T
        path = os.path.join(IMG_DIR, "{}.png".format(i))
        imsave(path, image)

    logging.info("Writing lights data to {}".format(LIGHTS_FNAME))
    np.savetxt(LIGHTS_FNAME, faces["lights"].T, fmt="%f")

    logging.info("Writing poses data to {}".format(POSES_FNAME))
    np.savetxt(POSES_FNAME, faces["poses"].T, fmt="%f")

    logging.info("Writing PCA-whitened data to {}".format(PCA_FNAME))
    # Rows are faces; keep the first 256 whitened principal components.
    X = PCA(n_components=256, whiten=True).fit_transform(face_images.T)
    np.savetxt(PCA_FNAME, X, fmt="%f")