about summary refs log tree commit diff
path: root/datasets/wdbc
diff options
context:
space:
mode:
author Samuel Fadel <samuelfadel@gmail.com> 2016-08-19 14:20:57 -0300
committer Samuel Fadel <samuelfadel@gmail.com> 2016-08-19 14:20:57 -0300
commit b255338295587246292dc978e7d4d5687ee01fb4 (patch)
tree 1581b76a03f4929c5132dcb3c6920fa761f8261c /datasets/wdbc
parent fbf8d82cdd3720c4bbf2a94035b6779e56d73448 (diff)
Scripts and other files for building all datasets.
Diffstat (limited to 'datasets/wdbc')
-rw-r--r-- datasets/wdbc/source 1
-rw-r--r-- datasets/wdbc/wdbc_extract.py 34
2 files changed, 35 insertions, 0 deletions
diff --git a/datasets/wdbc/source b/datasets/wdbc/source
new file mode 100644
index 0000000..67d201a
--- /dev/null
+++ b/datasets/wdbc/source
@@ -0,0 +1 @@
+http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/
diff --git a/datasets/wdbc/wdbc_extract.py b/datasets/wdbc/wdbc_extract.py
new file mode 100644
index 0000000..9b6b84a
--- /dev/null
+++ b/datasets/wdbc/wdbc_extract.py
@@ -0,0 +1,34 @@
+import hashlib
+import logging
+import pandas as pd
+import os
+import os.path
+import wget
+
+
+DATA_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
+DATA_SHA256 = "d606af411f3e5be8a317a5a8b652b425aaf0ff38ca683d5327ffff94c3695f4a"  # expected SHA-256 hex digest of DATA_FILE
+DATA_FILE = "wdbc.data"  # local path the download is written to and later parsed from
+
+
+if __name__ == "__main__":
+    # Fetch wdbc.data if it is missing, verify its checksum, then split it into three files.
+    logging.basicConfig(filename="wdbc_extract.log", level=logging.INFO,
+                        format="%(levelname)s:%(message)s")
+    if not os.path.exists(DATA_FILE):
+        logging.info("Downloading '{}'".format(DATA_URL))
+        wget.download(DATA_URL, DATA_FILE)
+    # Verify on every run so that a corrupted cached copy is never used silently.
+    with open(DATA_FILE, "rb") as f:
+        if hashlib.sha256(f.read()).hexdigest() != DATA_SHA256:
+            logging.error("'{}' is corrupted; aborting".format(DATA_FILE))
+            raise SystemExit(1)  # exit status 1 without relying on the site-provided exit()
+    # No header row; column 0 is kept as ids, column 1 as labels, the rest as the data table.
+    data = pd.read_table(DATA_FILE, header=None, delimiter=",")
+    wdbc_ids = data[0]
+    wdbc_labels = data[1]
+    wdbc = data.drop([0, 1], axis=1)
+    # Outputs are space-separated, with no index column and no header line.
+    wdbc.to_csv("wdbc.tbl", sep=" ", index=False, header=False)
+    wdbc_labels.to_csv("wdbc.labels", sep=" ", index=False, header=False)
+    wdbc_ids.to_csv("wdbc.ids", sep=" ", index=False, header=False)