path: root/datasets
diff options
authorSamuel Fadel <samuelfadel@gmail.com>2016-08-19 14:20:57 -0300
committerSamuel Fadel <samuelfadel@gmail.com>2016-08-19 14:20:57 -0300
commitb255338295587246292dc978e7d4d5687ee01fb4 (patch)
tree1581b76a03f4929c5132dcb3c6920fa761f8261c /datasets
parentfbf8d82cdd3720c4bbf2a94035b6779e56d73448 (diff)
Scripts and other files for building all datasets.
Diffstat (limited to 'datasets')
17 files changed, 4482 insertions, 0 deletions
diff --git a/datasets/faces/faces_extract.py b/datasets/faces/faces_extract.py
new file mode 100644
index 0000000..3e8b4f3
--- /dev/null
+++ b/datasets/faces/faces_extract.py
@@ -0,0 +1,81 @@
+from scipy.io import loadmat
+from scipy.misc import imsave
+from sklearn.decomposition import PCA
+import hashlib
+import logging
+import numpy as np
+import os
+import os.path
+import sklearn.decomposition
+import subprocess
+import wget
+# Original data: compressed MATLAB file and the SHA-256 digest it must have
+DATA_URL = "http://isomap.stanford.edu/face_data.mat.Z"
+SHA256_DIGEST = "9c5bc75f204071bbd340aa3ff584757ec784b0630206e526d4cd3809f2650a8a"
+# Local name of the uncompressed data file
+DATA_FNAME = "face_data.mat"
+# Output files/directories
+IMG_DIR = "images"
+IMG_FNAME = "face_raw.tbl"
+LIGHTS_FNAME = "face_lights.tbl"
+POSES_FNAME = "face_poses.tbl"
+PCA_FNAME = "faces.tbl"
+
+
+# Script entry point: fetch the faces data if needed, then write the raw image
+# table, one PNG per face, the lights and poses tables and the PCA-whitened
+# table. Progress and errors go to 'faces_extract.log'.
+if __name__ == "__main__":
+    logging.basicConfig(filename="faces_extract.log",
+                        format="%(levelname)s:%(message)s",
+                        level=logging.INFO)
+
+    # Get original data
+    if not os.path.exists(DATA_FNAME):
+        compressed_fname = "{}.Z".format(DATA_FNAME)
+        if not os.path.exists(compressed_fname):
+            logging.info("Downloading faces data from '{}'".format(DATA_URL))
+            wget.download(DATA_URL, compressed_fname)
+
+        # The digest that is computed and compared is SHA-256
+        logging.info("Checking SHA-256 digest")
+        with open(compressed_fname, "rb") as f:
+            if hashlib.sha256(f.read()).hexdigest() != SHA256_DIGEST:
+                logging.error("File seems corrupted; aborting")
+                exit(1)
+
+        logging.info("Uncompressing data into '{}'".format(DATA_FNAME))
+        # Stop here if the external tool fails instead of letting loadmat()
+        # fail later on a missing file
+        if subprocess.call(["uncompress", compressed_fname]) != 0:
+            logging.error("Could not uncompress '{}'; aborting"
+                          .format(compressed_fname))
+            exit(1)
+
+    # We have the original data; proceed
+    logging.info("Loading faces data")
+    faces = loadmat(DATA_FNAME)
+    # One face per column; transposed below so that each row is one face
+    face_images = faces["images"]
+
+    logging.info("Writing image table data to {}".format(IMG_FNAME))
+    np.savetxt(IMG_FNAME, face_images.T, fmt="%f")
+
+    if not os.path.exists(IMG_DIR):
+        logging.info("Creating directory {}".format(IMG_DIR))
+        os.makedirs(IMG_DIR, 0o755)
+    elif not os.path.isdir(IMG_DIR):
+        logging.error("File {} exists; aborting".format(IMG_DIR))
+        exit(1)
+
+    logging.info("Writing image files to {}".format(IMG_DIR))
+    # NOTE(review): scipy.misc.imsave is not available in recent SciPy
+    # releases; confirm the SciPy version this script is run with.
+    for i, flat_image in enumerate(face_images.T):
+        # Each face is stored as a flat vector of 64*64 pixels
+        image = flat_image.reshape(64, 64).T
+        path = os.path.join(IMG_DIR, "{}.png".format(i))
+        imsave(path, image)
+
+    logging.info("Writing lights data to {}".format(LIGHTS_FNAME))
+    np.savetxt(LIGHTS_FNAME, faces["lights"].T, fmt="%f")
+
+    logging.info("Writing poses data to {}".format(POSES_FNAME))
+    np.savetxt(POSES_FNAME, faces["poses"].T, fmt="%f")
+
+    logging.info("Writing PCA-whitened data to {}".format(PCA_FNAME))
+    X = PCA(n_components=256, whiten=True).fit_transform(face_images.T)
+    np.savetxt(PCA_FNAME, X, fmt="%f")
diff --git a/datasets/faces/source b/datasets/faces/source
new file mode 100644
index 0000000..e89da9b
--- /dev/null
+++ b/datasets/faces/source
@@ -0,0 +1 @@
diff --git a/datasets/mnist/mnist_extract.py b/datasets/mnist/mnist_extract.py
new file mode 100644
index 0000000..403b250
--- /dev/null
+++ b/datasets/mnist/mnist_extract.py
@@ -0,0 +1,148 @@
+from array import array as pyarray
+from scipy.io import loadmat
+from sklearn.decomposition import PCA
+import gzip
+import hashlib
+import logging
+import numpy as np
+import os
+import os.path
+import struct
+import sys
+import wget
+# Download locations of the four MNIST archives
+TRAIN_IMAGES_URL = "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"
+TRAIN_LABELS_URL = "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"
+TEST_IMAGES_URL = "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"
+TEST_LABELS_URL = "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"
+# Expected SHA-256 digests of the downloaded archives
+TRAIN_IMAGES_SHA256 = "440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609"
+TRAIN_LABELS_SHA256 = "3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c"
+TEST_IMAGES_SHA256 = "8d422c7b0a1c1c79245a5bcf07fe86e33eeafee792b84584aec276f5a2dbc4e6"
+TEST_LABELS_SHA256 = "f7ae60f92e00ec6debd23a6088c31dbd2371eca3ffa0defaefb259924204aec6"
+# Files holding the sampled row indices of each data set
+TRAIN_SAMPLE_INDICES_FNAME = "mnist_train_sample.tbl"
+TEST_SAMPLE_INDICES_FNAME = "mnist_test_sample.tbl"
+# Local names of the image archives, keyed by data set ('train' or 'test')
+FNAME_IMG = {
+    'train': 'train-images-idx3-ubyte.gz',
+    'test': 't10k-images-idx3-ubyte.gz',
+}
+# Local names of the label archives, keyed by data set ('train' or 'test')
+FNAME_LBL = {
+    'train': 'train-labels-idx1-ubyte.gz',
+    'test': 't10k-labels-idx1-ubyte.gz',
+}
+def download_and_check(in_url, out_fname, sha256sum):
+    """
+    Downloads in_url into the file out_fname and verifies what was written.
+    Returns True when the SHA-256 hex digest of the saved file equals
+    sha256sum, False otherwise.
+    """
+    logging.info("Downloading '{}'".format(in_url))
+    wget.download(in_url, out_fname)
+    with open(out_fname, "rb") as downloaded:
+        digest = hashlib.sha256(downloaded.read()).hexdigest()
+    return digest == sha256sum
+def load_mnist(data="train", digits=np.arange(10)):
+    """
+    Reads the gzipped MNIST files selected by data (a key of FNAME_IMG and
+    FNAME_LBL) and keeps only the items whose label is in digits.
+    Returns a pair (images, labels): images is a uint8 array with one
+    flattened image (rows*cols values) per row and labels is an int8 column
+    array with the matching labels.
+    """
+    with gzip.open(FNAME_LBL[data], 'rb') as label_file:
+        # Header: magic number and item count, both big-endian 32-bit
+        _, _ = struct.unpack(">II", label_file.read(8))
+        all_labels = np.frombuffer(label_file.read(), dtype=np.int8)
+
+    with gzip.open(FNAME_IMG[data], 'rb') as image_file:
+        # Header: magic number, item count, image height and width
+        _, size, rows, cols = struct.unpack(">IIII", image_file.read(16))
+        pixels = np.frombuffer(image_file.read(), dtype=np.uint8,
+                               count=size * rows * cols)
+
+    all_images = pixels.reshape(size, rows * cols)
+    wanted = np.array([k for k in range(size) if all_labels[k] in digits],
+                      dtype=np.intp)
+
+    # Indexing with an index array copies, so the results own their data
+    images = all_images[wanted]
+    labels = all_labels[wanted].reshape(-1, 1)
+    return images, labels
+# Script entry point: fetch the MNIST archives if needed, pick (or reload) a
+# random sample of training and test rows, reduce the sample with whitened
+# PCA and write the resulting table and its labels.
+#
+# usage: mnist_extract.py [SAMPLE_SIZE]
+# SAMPLE_SIZE is required when the sample index files do not exist yet.
+if __name__ == "__main__":
+    logging.basicConfig(filename="mnist_extract.log",
+                        format="%(levelname)s:%(message)s",
+                        level=logging.INFO)
+
+    # Get and check original data if needed. The three lists are parallel:
+    # position k of each one describes the same archive.
+    urls = [TRAIN_IMAGES_URL, TRAIN_LABELS_URL,
+            TEST_IMAGES_URL, TEST_LABELS_URL]
+    fnames = [FNAME_IMG['train'], FNAME_LBL['train'],
+              FNAME_IMG['test'], FNAME_LBL['test']]
+    sha256sums = [TRAIN_IMAGES_SHA256, TRAIN_LABELS_SHA256,
+                  TEST_IMAGES_SHA256, TEST_LABELS_SHA256]
+    for url, fname, sha256sum in zip(urls, fnames, sha256sums):
+        if not os.path.exists(fname):
+            ok = download_and_check(url, fname, sha256sum)
+            if not ok:
+                logging.error("'{}' is corrupted; aborting".format(fname))
+                exit(1)
+
+    # We now have the original data
+    logging.info("Loading MNIST training data")
+    mnist_train = dict()
+    mnist_train['train_X'], mnist_train['train_labels'] = load_mnist("train")
+    train_size = mnist_train['train_X'].shape[0]
+
+    logging.info("Loading MNIST test data")
+    mnist_test = dict()
+    mnist_test['test_X'], mnist_test['test_labels'] = load_mnist("test")
+    test_size = mnist_test['test_X'].shape[0]
+
+    have_saved_samples = (os.path.exists(TRAIN_SAMPLE_INDICES_FNAME)
+                          and os.path.exists(TEST_SAMPLE_INDICES_FNAME))
+
+    should_load_samples = False
+    if len(sys.argv) == 2 or not have_saved_samples:
+        # New samples are needed, so a sample size must have been given
+        if len(sys.argv) < 2:
+            logging.error("No saved samples and no sample size given; aborting")
+            exit(1)
+        try:
+            sample_size = int(sys.argv[1])
+        except ValueError as e:
+            logging.error("Invalid sample size: {}".format(e))
+            exit(1)
+
+        if sample_size / 2 > min(train_size, test_size):
+            print("sample size is too large")
+            should_load_samples = True
+        else:
+            logging.info("Generating {} samples".format(sample_size))
+            # Half of the sample comes from each data set; the size passed to
+            # randint must be an integer, hence the floor division
+            half_size = sample_size // 2
+            train_sample_indices = np.random.randint(0, train_size, half_size)
+            test_sample_indices = np.random.randint(0, test_size, half_size)
+
+            logging.info("Saving generated samples")
+            np.savetxt(TRAIN_SAMPLE_INDICES_FNAME, train_sample_indices,
+                       fmt="%u")
+            np.savetxt(TEST_SAMPLE_INDICES_FNAME, test_sample_indices,
+                       fmt="%u")
+    else:
+        should_load_samples = True
+
+    if should_load_samples:
+        if not have_saved_samples:
+            logging.error("No saved samples to load; aborting")
+            exit(1)
+        logging.info("Loading samples")
+        # ndmin=1 keeps a one-line file from loading as a 0-d array
+        train_sample_indices = np.loadtxt(TRAIN_SAMPLE_INDICES_FNAME,
+                                          dtype=int, ndmin=1)
+        test_sample_indices = np.loadtxt(TEST_SAMPLE_INDICES_FNAME,
+                                         dtype=int, ndmin=1)
+
+    # The effective sample size is whatever the index arrays hold
+    sample_size = train_sample_indices.shape[0] \
+        + test_sample_indices.shape[0]
+
+    logging.info("Extracting {} samples".format(sample_size))
+    train_samples = mnist_train['train_X'][train_sample_indices, :]
+    test_samples = mnist_test['test_X'][test_sample_indices, :]
+    mnist_sample = np.concatenate((train_samples, test_samples))
+    mnist_sample = PCA(n_components=512, whiten=True).fit_transform(mnist_sample)
+
+    # Labels are concatenated in the same order as the rows above
+    train_labels = mnist_train['train_labels'][train_sample_indices]
+    test_labels = mnist_test['test_labels'][test_sample_indices]
+    mnist_sample_labels = np.concatenate((train_labels, test_labels))
+
+    logging.info("Saving extracted samples and their labels")
+    sample_fname = "mnist_{}.tbl".format(sample_size)
+    labels_fname = "mnist_{}.labels".format(sample_size)
+    np.savetxt(sample_fname, mnist_sample, fmt="%f")
+    np.savetxt(labels_fname, mnist_sample_labels, fmt="%u")
diff --git a/datasets/mnist/mnist_test_sample.tbl b/datasets/mnist/mnist_test_sample.tbl
new file mode 100644
index 0000000..01dbdb4
--- /dev/null
+++ b/datasets/mnist/mnist_test_sample.tbl
@@ -0,0 +1,1000 @@
diff --git a/datasets/mnist/mnist_train_sample.tbl b/datasets/mnist/mnist_train_sample.tbl
new file mode 100644
index 0000000..0b136db
--- /dev/null
+++ b/datasets/mnist/mnist_train_sample.tbl
@@ -0,0 +1,1000 @@
diff --git a/datasets/mnist/source b/datasets/mnist/source
new file mode 100644
index 0000000..19ebae4
--- /dev/null
+++ b/datasets/mnist/source
@@ -0,0 +1 @@
diff --git a/datasets/newsgroups/README b/datasets/newsgroups/README
new file mode 100644
index 0000000..78e92ee
--- /dev/null
+++ b/datasets/newsgroups/README
@@ -0,0 +1,7 @@
+For running the script newsgroups_extract.py we used N = 500 and topics are
+'comp.graphics', 'misc.forsale', and 'sci.med'.
+The randomly generated ids in our case are in the file 'newsgroups-500-3.ids'.
+Before running the script, be sure to generate the stop words file (words.txt)
+by running stop.sh.
diff --git a/datasets/newsgroups/newsgroups-500-3.ids b/datasets/newsgroups/newsgroups-500-3.ids
new file mode 100644
index 0000000..9b468bd
--- /dev/null
+++ b/datasets/newsgroups/newsgroups-500-3.ids
@@ -0,0 +1,1493 @@
diff --git a/datasets/newsgroups/newsgroups_extract.py b/datasets/newsgroups/newsgroups_extract.py
new file mode 100644
index 0000000..51c5030
--- /dev/null
+++ b/datasets/newsgroups/newsgroups_extract.py
@@ -0,0 +1,137 @@
+from sklearn.decomposition import PCA
+from sklearn.feature_extraction.text import TfidfVectorizer
+import hashlib
+import logging
+import numpy as np
+import os
+import os.path
+import sys
+import tarfile
+import wget
+# Source tarball and the local file name it is saved under
+DATA_URL = "http://kdd.ics.uci.edu/databases/20newsgroups/20_newsgroups.tar.gz"
+DATA_FILE = "20_newsgroups.tar.gz"
+# Expected SHA-256 hex digest of DATA_FILE.
+# NOTE(review): this value has 63 hex digits while a SHA-256 hex digest has 64,
+# so it cannot equal any computed digest -- confirm the correct value.
+DATA_SHA256 = "b7bbf82b7831f7dbb1a09d9312f66fa78565c8de25526999b0d66f69d37e414"
+def build_topic_corpus(corpus_file, n, topic):
+    """
+    Extracts up to N random documents of a topic from an open tar archive.
+
+    corpus_file is an open tarfile.TarFile, n is the maximum number of
+    documents to pick and topic is a string; every regular file whose member
+    name contains topic is a candidate.
+
+    Returns a pair (topic_ids, topic_corpus): the base names of the picked
+    members and their contents decoded as UTF-8, in the same order. Documents
+    that are not valid UTF-8 are skipped with a warning, so fewer than N items
+    may be returned.
+
+    Raises ValueError(topic) if no member matches the topic.
+    """
+    logging.info("Extracting corpus for topic '{}'".format(topic))
+    # Walk the member list once instead of looking every name up again with
+    # getmember(), which searches the member list on each call
+    topic_members = [member for member in corpus_file.getmembers()
+                     if topic in member.name and member.isfile()]
+    if not topic_members:
+        # Topic does not exist (no items fetched)
+        raise ValueError(topic)
+
+    # Random selection: shuffle all candidate positions and keep the first n
+    indices = np.arange(len(topic_members))
+    np.random.shuffle(indices)
+
+    topic_ids = []
+    topic_corpus = []
+    for i in indices[:n]:
+        member = topic_members[i]
+        with corpus_file.extractfile(member) as f:
+            raw_contents = f.read()
+        try:
+            contents = str(raw_contents, encoding="utf8")
+        except ValueError as e:
+            # UnicodeDecodeError is a ValueError; skip undecodable documents
+            logging.warning("Encoding error in '{}': {}".format(member.name, e))
+            continue
+        topic_ids.append(os.path.basename(member.name))
+        topic_corpus.append(contents)
+    return topic_ids, topic_corpus
+def build_corpus(n, topics):
+    """
+    Reads the tarball DATA_FILE and samples N documents for every topic.
+    Returns a pair: the list of document IDs of all topics, in the order the
+    topics were given, and a dict that maps each topic to the list of its
+    document contents.
+    """
+    all_ids = []
+    by_topic = {}
+    with tarfile.open(DATA_FILE, "r:gz") as archive:
+        for topic in topics:
+            doc_ids, documents = build_topic_corpus(archive, n, topic)
+            all_ids += doc_ids
+            by_topic[topic] = documents
+    return all_ids, by_topic
+# Script entry point: build a TF-IDF model from N random articles of each
+# topic, reduce it with whitened PCA and write the document IDs, the feature
+# table and the labels. Progress and errors go to 'newsgroups_extract.log'.
+if __name__ == "__main__":
+    if len(sys.argv) < 4:
+        print("usage: {} STOP_WORDS N TOPIC [ TOPIC [ ... ] ]".format(sys.argv[0]))
+        print("The program reads the file STOP_WORDS for stop words, extracts"
+              + " and generates a BoW model from N random articles of each TOPIC")
+        exit(1)
+
+    logging.basicConfig(filename="newsgroups_extract.log",
+                        format="%(levelname)s:%(message)s",
+                        level=logging.INFO)
+
+    if not os.path.exists(DATA_FILE):
+        logging.info("Downloading data from '{}'".format(DATA_URL))
+        wget.download(DATA_URL, DATA_FILE)
+
+    with open(DATA_FILE, "rb") as f:
+        actual_sha256 = hashlib.sha256(f.read()).hexdigest()
+    if len(DATA_SHA256) != len(actual_sha256):
+        # An expected digest of the wrong length can never match; report it
+        # instead of rejecting every download
+        logging.warning("Expected digest of '{}' is malformed; "
+                        "skipping integrity check".format(DATA_FILE))
+    elif actual_sha256 != DATA_SHA256:
+        # Abort on a mismatch (the test used to be inverted)
+        logging.error("'{}' is corrupted; aborting".format(DATA_FILE))
+        exit(1)
+
+    # Read stop words list
+    try:
+        with open(sys.argv[1]) as stop_words_file:
+            stop_words = stop_words_file.read().split()
+    except (OSError, UnicodeError) as e:
+        logging.error("Could not read stop words: {}".format(e))
+        exit(1)
+
+    try:
+        n = int(sys.argv[2])
+        if (n < 2) or (n > 1000):
+            raise ValueError("N must be between 2 and 1000")
+    except ValueError as e:
+        logging.error("Invalid argument: {}".format(e))
+        exit(1)
+
+    # Extract text corpus from tarball
+    logging.info("Building corpus")
+    topics = sys.argv[3:]
+    try:
+        ids, corpus = build_corpus(n, topics)
+    except ValueError as e:
+        logging.error("Invalid topic: {}".format(e))
+        exit(1)
+
+    # Walk the topics in command-line order, which is the order 'ids' was
+    # built in, so that rows, IDs and labels line up whatever order the dict
+    # iterates in
+    corpus_text = []
+    for topic in topics:
+        corpus_text.extend(corpus[topic])
+
+    # Compute the TF-IDF matrix
+    logging.info("Computing TF-IDF matrix")
+    vectorizer = TfidfVectorizer(min_df=0.01, stop_words=stop_words)
+    X = vectorizer.fit_transform(corpus_text)
+
+    # Reduce data dimensionality using PCA
+    logging.info("Computing PCA and reducing to 512 dimensions")
+    X = PCA(n_components=512, whiten=True).fit_transform(X.toarray())
+
+    # Save all extracted features and related data
+    logging.info("Writing IDs file")
+    ids_fname = "newsgroups-{}-{}.ids".format(n, len(topics))
+    np.savetxt(ids_fname, ids, fmt="%s")
+
+    logging.info("Writing table file")
+    tbl_fname = "newsgroups-{}-{}.tbl".format(n, len(topics))
+    # X is already a dense array after PCA, so it has no todense() method
+    np.savetxt(tbl_fname, X, fmt="%f")
+
+    logging.info("Writing labels file")
+    labels_fname = "newsgroups-{}-{}.labels".format(n, len(topics))
+    # One label per document: each topic repeated once per kept document
+    counts = [len(corpus[topic]) for topic in topics]
+    np.savetxt(labels_fname, np.repeat(topics, counts), fmt="%s")
diff --git a/datasets/newsgroups/source b/datasets/newsgroups/source
new file mode 100644
index 0000000..764f792
--- /dev/null
+++ b/datasets/newsgroups/source
@@ -0,0 +1 @@
diff --git a/datasets/newsgroups/stop.sh b/datasets/newsgroups/stop.sh
new file mode 100644
index 0000000..36a5f74
--- /dev/null
+++ b/datasets/newsgroups/stop.sh
@@ -0,0 +1,12 @@
+# stop.sh
+# Build the stop words list 'words.txt' out of the 'stop.txt' file.
+# Original source: http://snowball.tartarus.org/algorithms/english/stop.txt
+# NOTE: in our experiments, stop.txt has been modified to include the last stop
+# words (stop.txt is included).
+# The three expressions run in order on every line: drop the comment that
+# starts at '|', remove the spaces, then delete the line if nothing is left.
+sed -e 's/|.*//g' -e 's/ \+//g' -e '/^$/d' <stop.txt >words.txt
diff --git a/datasets/newsgroups/stop.txt b/datasets/newsgroups/stop.txt
new file mode 100644
index 0000000..5d0a34b
--- /dev/null
+++ b/datasets/newsgroups/stop.txt
@@ -0,0 +1,310 @@
+ | An English stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+ | Many of the forms below are quite rare (e.g. "yourselves") but included for
+ | completeness.
+ | 1st person sing
+i | subject, always in upper case of course
+me | object
+my | possessive adjective
+ | the possessive pronoun `mine' is best suppressed, because of the
+ | sense of coal-mine etc.
+myself | reflexive
+ | 1st person plural
+we | subject
+| us | object
+ | care is required here because US = United States. It is usually
+ | safe to remove it if it is in lower case.
+our | possessive adjective
+ours | possessive pronoun
+ourselves | reflexive
+ | second person (archaic `thou' forms not included)
+you | subject and object
+your | possessive adjective
+yours | possessive pronoun
+yourself | reflexive (singular)
+yourselves | reflexive (plural)
+ | third person singular
+he | subject
+him | object
+his | possessive adjective and pronoun
+himself | reflexive
+she | subject
+her | object and possessive adjective
+hers | possessive pronoun
+herself | reflexive
+it | subject and object
+its | possessive adjective
+itself | reflexive
+ | third person plural
+they | subject
+them | object
+their | possessive adjective
+theirs | possessive pronoun
+themselves | reflexive
+ | other forms (demonstratives, interrogatives)
+ | VERB FORMS (using F.R. Palmer's nomenclature)
+ | BE
+am | 1st person, present
+is | -s form (3rd person, present)
+are | present
+was | 1st person, past
+were | past
+be | infinitive
+been | past participle
+being | -ing form
+ | HAVE
+have | simple
+has | -s form
+had | past
+having | -ing form
+ | DO
+do | simple
+does | -s form
+did | past
+doing | -ing form
+ | The forms below are, I believe, best omitted, because of the significant
+ | homonym forms:
+ | He made a WILL
+ | old tin CAN
+ | merry month of MAY
+ | a smell of MUST
+ | fight the good fight with all thy MIGHT
+ | would, could, should, ought might however be included
+ | | WILL
+ |will
+ | | SHALL
+ |shall
+ | | CAN
+ |can
+ | | MAY
+ |may
+ |might
+ | | MUST
+ |must
+ | | OUGHT
+ | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
+ | pronoun + verb
+ | verb + negation
+ | auxiliary + negation
+ | miscellaneous forms
+ | rarer forms
+ | daren't needn't
+ | doubtful forms
+ | oughtn't mightn't
+ | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
+ | high, that classification is pointless.)
diff --git a/datasets/newsgroups/words.txt b/datasets/newsgroups/words.txt
new file mode 100644
index 0000000..0d11300
--- /dev/null
+++ b/datasets/newsgroups/words.txt
@@ -0,0 +1,216 @@
diff --git a/datasets/segmentation/segmentation_extract.py b/datasets/segmentation/segmentation_extract.py
new file mode 100644
index 0000000..e621161
--- /dev/null
+++ b/datasets/segmentation/segmentation_extract.py
@@ -0,0 +1,39 @@
+import hashlib
+import logging
+import pandas as pd
+import os
+import os.path
+import wget
+# Source file, its expected SHA-256 digest and its local name
+DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/image/segmentation.test"
+DATA_SHA256 = "2e9e966479d54c6aaec309059376dd9c89c1b46bf3a23aceeefb36d20d93a189"
+DATA_FILE = "segmentation.test"
+
+
+# Script entry point: fetch the segmentation data if needed and split it into
+# a feature table ('segmentation.tbl') and numeric labels
+# ('segmentation.labels'). Messages go to 'segmentation_extract.log'.
+if __name__ == "__main__":
+    logging.basicConfig(filename="segmentation_extract.log",
+                        format="%(levelname)s:%(message)s",
+                        level=logging.INFO)
+
+    if not os.path.exists(DATA_FILE):
+        logging.info("Downloading '{}'".format(DATA_URL))
+        wget.download(DATA_URL, DATA_FILE)
+
+    with open(DATA_FILE, "rb") as f:
+        if hashlib.sha256(f.read()).hexdigest() != DATA_SHA256:
+            logging.error("{} is corrupted; aborting".format(DATA_FILE))
+            # Actually abort, as the message says
+            exit(1)
+
+    # NOTE(review): the first 4 lines are skipped, presumably a file header
+    # -- confirm against the data file
+    df = pd.read_table(DATA_FILE, header=None, skiprows=4, delimiter=",")
+
+    # First column contains class names, which we convert to numbers using the
+    # 'class_labels' dict. Names are sorted so that every run assigns the same
+    # number to the same class (set iteration order is not stable).
+    classes = sorted(set(df[0]))
+    class_labels = {name: number for number, name in enumerate(classes)}
+
+    # NOTE(review): column 3 is dropped together with the class column;
+    # presumably it carries no information -- confirm
+    data = df.drop([0, 3], axis=1)
+    data.to_csv("segmentation.tbl", sep=" ", index=False, header=False)
+
+    labels = df[0].apply(lambda x: class_labels[x])
+    labels.to_csv("segmentation.labels", sep=" ", index=False, header=False)
diff --git a/datasets/segmentation/source b/datasets/segmentation/source
new file mode 100644
index 0000000..ab98436
--- /dev/null
+++ b/datasets/segmentation/source
@@ -0,0 +1 @@
diff --git a/datasets/wdbc/source b/datasets/wdbc/source
new file mode 100644
index 0000000..67d201a
--- /dev/null
+++ b/datasets/wdbc/source
@@ -0,0 +1 @@
diff --git a/datasets/wdbc/wdbc_extract.py b/datasets/wdbc/wdbc_extract.py
new file mode 100644
index 0000000..9b6b84a
--- /dev/null
+++ b/datasets/wdbc/wdbc_extract.py
@@ -0,0 +1,34 @@
+import hashlib
+import logging
+import pandas as pd
+import os
+import os.path
+import wget
+# Source file, its expected SHA-256 digest and its local name
+DATA_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
+DATA_SHA256 = "d606af411f3e5be8a317a5a8b652b425aaf0ff38ca683d5327ffff94c3695f4a"
+DATA_FILE = "wdbc.data"
+
+
+# Script entry point: fetch the WDBC data if needed and split it into a
+# feature table ('wdbc.tbl'), labels ('wdbc.labels') and IDs ('wdbc.ids').
+# Messages go to 'wdbc_extract.log'.
+if __name__ == "__main__":
+    logging.basicConfig(filename="wdbc_extract.log",
+                        format="%(levelname)s:%(message)s",
+                        level=logging.INFO)
+
+    if not os.path.exists(DATA_FILE):
+        # The closing quote was missing from this message
+        logging.info("Downloading '{}'".format(DATA_URL))
+        wget.download(DATA_URL, DATA_FILE)
+
+    with open(DATA_FILE, "rb") as f:
+        if hashlib.sha256(f.read()).hexdigest() != DATA_SHA256:
+            logging.error("'{}' is corrupted; aborting".format(DATA_FILE))
+            exit(1)
+
+    data = pd.read_table(DATA_FILE, header=None, delimiter=",")
+
+    # Column 0 is written out as the IDs and column 1 as the labels; the
+    # remaining columns form the feature table
+    wdbc_ids = data[0]
+    wdbc_labels = data[1]
+    wdbc = data.drop([0, 1], axis=1)
+
+    wdbc.to_csv("wdbc.tbl", sep=" ", index=False, header=False)
+    wdbc_labels.to_csv("wdbc.labels", sep=" ", index=False, header=False)
+    wdbc_ids.to_csv("wdbc.ids", sep=" ", index=False, header=False)