From b255338295587246292dc978e7d4d5687ee01fb4 Mon Sep 17 00:00:00 2001
From: Samuel Fadel <samuelfadel@gmail.com>
Date: Fri, 19 Aug 2016 14:20:57 -0300
Subject: Scripts and other files for building all datasets.

---
 datasets/faces/faces_extract.py | 81 +++++++++++++++++++++++++++++++++++++++++
 datasets/faces/source           |  1 +
 2 files changed, 82 insertions(+)
 create mode 100644 datasets/faces/faces_extract.py
 create mode 100644 datasets/faces/source

(limited to 'datasets/faces')

diff --git a/datasets/faces/faces_extract.py b/datasets/faces/faces_extract.py
new file mode 100644
index 0000000..3e8b4f3
--- /dev/null
+++ b/datasets/faces/faces_extract.py
@@ -0,0 +1,81 @@
+from scipy.io import loadmat
+from scipy.misc import imsave
+from sklearn.decomposition import PCA
+
+import hashlib
+import logging
+import numpy as np
+import os
+import os.path
+import sklearn.decomposition
+import subprocess
+import wget
+
+
+# Original data
+DATA_URL      = "http://isomap.stanford.edu/face_data.mat.Z"
+SHA256_DIGEST = "9c5bc75f204071bbd340aa3ff584757ec784b0630206e526d4cd3809f2650a8a"
+
+# Local name
+DATA_FNAME = "face_data.mat"
+
+# Output files/directories
+IMG_DIR      = "images"
+IMG_FNAME    = "face_raw.tbl"
+LIGHTS_FNAME = "face_lights.tbl"
+POSES_FNAME  = "face_poses.tbl"
+PCA_FNAME    = "faces.tbl"
+
+
+if __name__ == "__main__":
+    logging.basicConfig(filename="faces_extract.log",
+                        format="%(levelname)s:%(message)s",
+                        level=logging.INFO)
+
+    # Get original data
+    if not os.path.exists(DATA_FNAME):
+        if not os.path.exists("{}.Z".format(DATA_FNAME)):
+            logging.info("Downloading faces data from '{}'".format(DATA_URL))
+            wget.download(DATA_URL, "{}.Z".format(DATA_FNAME))
+
+        logging.info("Checking SHA-1 digest")
+        with open("{}.Z".format(DATA_FNAME), "rb") as f:
+            if hashlib.sha256(f.read()).hexdigest() != SHA256_DIGEST:
+                logging.error("File seems corrupted; aborting")
+                exit(1)
+
+        logging.info("Uncompressing data into '{}'".format(DATA_FNAME))
+        subprocess.call(["uncompress", "{}.Z".format(DATA_FNAME)])
+
+    # We have the original data; proceed
+    logging.info("Loading faces data")
+    faces = loadmat(DATA_FNAME)
+
+    face_images = faces["images"]
+    logging.info("Writing image table data to {}".format(IMG_FNAME))
+    np.savetxt(IMG_FNAME, face_images.T, fmt="%f")
+
+    if not os.path.exists(IMG_DIR):
+        logging.info("Creating directory {}".format(IMG_DIR))
+        os.makedirs(IMG_DIR, 0o755)
+    elif not os.path.isdir(IMG_DIR):
+        logging.error("File {} exists; aborting".format(IMG_DIR))
+        exit(1)
+
+    logging.info("Writing image files to {}".format(IMG_DIR))
+    for i in range(face_images.shape[1]):
+        image = face_images[:, i]
+        image = image.reshape(64, 64).T
+        path = os.path.join(IMG_DIR, "{}.png".format(i))
+        imsave(path, image)
+
+    logging.info("Writing lights data to {}".format(LIGHTS_FNAME))
+    np.savetxt(LIGHTS_FNAME, faces["lights"].T, fmt="%f")
+
+    logging.info("Writing poses data to {}".format(POSES_FNAME))
+    np.savetxt(POSES_FNAME, faces["poses"].T, fmt="%f")
+
+    logging.info("Writing PCA-whitened data to {}".format(PCA_FNAME))
+    X = faces["images"].T
+    X = PCA(n_components=256, whiten=True).fit_transform(X)
+    np.savetxt(PCA_FNAME, X, fmt="%f")
diff --git a/datasets/faces/source b/datasets/faces/source
new file mode 100644
index 0000000..e89da9b
--- /dev/null
+++ b/datasets/faces/source
@@ -0,0 +1 @@
+http://isomap.stanford.edu/datasets.html
-- 
cgit v1.2.3