diff options
author | Samuel Fadel <samuelfadel@gmail.com> | 2016-08-19 14:20:57 -0300 |
---|---|---|
committer | Samuel Fadel <samuelfadel@gmail.com> | 2016-08-19 14:20:57 -0300 |
commit | b255338295587246292dc978e7d4d5687ee01fb4 (patch) | |
tree | 1581b76a03f4929c5132dcb3c6920fa761f8261c /datasets/wdbc | |
parent | fbf8d82cdd3720c4bbf2a94035b6779e56d73448 (diff) |
Scripts and other files for building all datasets.
Diffstat (limited to 'datasets/wdbc')
-rw-r--r-- | datasets/wdbc/source | 1 | ||||
-rw-r--r-- | datasets/wdbc/wdbc_extract.py | 34 |
2 files changed, 35 insertions, 0 deletions
"""Download the WDBC data file and split it into plain-text tables.

The upstream directory (also recorded in the companion ``source`` file of
``datasets/wdbc``) is
http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/

Running this script produces, in the current working directory:

* ``wdbc.tbl``    -- every column of the raw file except the first two
* ``wdbc.labels`` -- column 1 of the raw file
* ``wdbc.ids``    -- column 0 of the raw file

All three are written space-separated, without index or header.
Progress and errors are logged to ``wdbc_extract.log``.
"""
import hashlib
import logging
import os
import os.path
import sys

import pandas as pd
import wget


DATA_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
DATA_SHA256 = "d606af411f3e5be8a317a5a8b652b425aaf0ff38ca683d5327ffff94c3695f4a"
DATA_FILE = "wdbc.data"


def _sha256_of(path, chunk_size=65536):
    """Return the hexadecimal SHA-256 digest of the file at *path*."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        # Feed the hash in fixed-size chunks so the file is never held in
        # memory as a whole.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def main():
    """Fetch (if needed), verify and split the raw data file.

    Returns:
        0 on success, 1 when the checksum of ``DATA_FILE`` does not match
        ``DATA_SHA256``.
    """
    logging.basicConfig(filename="wdbc_extract.log",
                        format="%(levelname)s:%(message)s",
                        level=logging.INFO)

    # Only hit the network when there is no local copy yet.
    if not os.path.exists(DATA_FILE):
        logging.info("Downloading '%s'", DATA_URL)
        wget.download(DATA_URL, DATA_FILE)

    # The checksum is verified on every run, whether the file was just
    # downloaded or was already present.
    if _sha256_of(DATA_FILE) != DATA_SHA256:
        logging.error("'%s' is corrupted; aborting", DATA_FILE)
        return 1

    data = pd.read_csv(DATA_FILE, header=None, sep=",")
    wdbc_ids = data[0]
    wdbc_labels = data[1]
    wdbc = data.drop([0, 1], axis=1)

    wdbc.to_csv("wdbc.tbl", sep=" ", index=False, header=False)
    wdbc_labels.to_csv("wdbc.labels", sep=" ", index=False, header=False)
    wdbc_ids.to_csv("wdbc.ids", sep=" ", index=False, header=False)
    return 0


if __name__ == "__main__":
    # sys.exit rather than the site-provided exit(), which is only meant for
    # the interactive interpreter and may be missing (e.g. under ``-S``).
    sys.exit(main())