diff options
author | Samuel Fadel <samuelfadel@gmail.com> | 2016-08-19 14:20:57 -0300 |
---|---|---|
committer | Samuel Fadel <samuelfadel@gmail.com> | 2016-08-19 14:20:57 -0300 |
commit | b255338295587246292dc978e7d4d5687ee01fb4 (patch) | |
tree | 1581b76a03f4929c5132dcb3c6920fa761f8261c | |
parent | fbf8d82cdd3720c4bbf2a94035b6779e56d73448 (diff) |
Scripts and other files for building all datasets.
-rw-r--r-- | datasets/faces/faces_extract.py | 81 | ||||
-rw-r--r-- | datasets/faces/source | 1 | ||||
-rw-r--r-- | datasets/mnist/mnist_extract.py | 148 | ||||
-rw-r--r-- | datasets/mnist/mnist_test_sample.tbl | 1000 | ||||
-rw-r--r-- | datasets/mnist/mnist_train_sample.tbl | 1000 | ||||
-rw-r--r-- | datasets/mnist/source | 1 | ||||
-rw-r--r-- | datasets/newsgroups/README | 7 | ||||
-rw-r--r-- | datasets/newsgroups/newsgroups-500-3.ids | 1493 | ||||
-rw-r--r-- | datasets/newsgroups/newsgroups_extract.py | 137 | ||||
-rw-r--r-- | datasets/newsgroups/source | 1 | ||||
-rw-r--r-- | datasets/newsgroups/stop.sh | 12 | ||||
-rw-r--r-- | datasets/newsgroups/stop.txt | 310 | ||||
-rw-r--r-- | datasets/newsgroups/words.txt | 216 | ||||
-rw-r--r-- | datasets/segmentation/segmentation_extract.py | 39 | ||||
-rw-r--r-- | datasets/segmentation/source | 1 | ||||
-rw-r--r-- | datasets/wdbc/source | 1 | ||||
-rw-r--r-- | datasets/wdbc/wdbc_extract.py | 34 |
17 files changed, 4482 insertions, 0 deletions
diff --git a/datasets/faces/faces_extract.py b/datasets/faces/faces_extract.py new file mode 100644 index 0000000..3e8b4f3 --- /dev/null +++ b/datasets/faces/faces_extract.py @@ -0,0 +1,81 @@ +from scipy.io import loadmat +from scipy.misc import imsave +from sklearn.decomposition import PCA + +import hashlib +import logging +import numpy as np +import os +import os.path +import sklearn.decomposition +import subprocess +import wget + + +# Original data +DATA_URL = "http://isomap.stanford.edu/face_data.mat.Z" +SHA256_DIGEST = "9c5bc75f204071bbd340aa3ff584757ec784b0630206e526d4cd3809f2650a8a" + +# Local name +DATA_FNAME = "face_data.mat" + +# Output files/directories +IMG_DIR = "images" +IMG_FNAME = "face_raw.tbl" +LIGHTS_FNAME = "face_lights.tbl" +POSES_FNAME = "face_poses.tbl" +PCA_FNAME = "faces.tbl" + + +if __name__ == "__main__": + logging.basicConfig(filename="faces_extract.log", + format="%(levelname)s:%(message)s", + level=logging.INFO) + + # Get original data + if not os.path.exists(DATA_FNAME): + if not os.path.exists("{}.Z".format(DATA_FNAME)): + logging.info("Downloading faces data from '{}'".format(DATA_URL)) + wget.download(DATA_URL, "{}.Z".format(DATA_FNAME)) + + logging.info("Checking SHA-1 digest") + with open("{}.Z".format(DATA_FNAME), "rb") as f: + if hashlib.sha256(f.read()).hexdigest() != SHA256_DIGEST: + logging.error("File seems corrupted; aborting") + exit(1) + + logging.info("Uncompressing data into '{}'".format(DATA_FNAME)) + subprocess.call(["uncompress", "{}.Z".format(DATA_FNAME)]) + + # We have the original data; proceed + logging.info("Loading faces data") + faces = loadmat(DATA_FNAME) + + face_images = faces["images"] + logging.info("Writing image table data to {}".format(IMG_FNAME)) + np.savetxt(IMG_FNAME, face_images.T, fmt="%f") + + if not os.path.exists(IMG_DIR): + logging.info("Creating directory {}".format(IMG_DIR)) + os.makedirs(IMG_DIR, 0o755) + elif not os.path.isdir(IMG_DIR): + logging.error("File {} exists; aborting".format(IMG_DIR)) + exit(1) + + logging.info("Writing image files to {}".format(IMG_DIR)) + for i in range(face_images.shape[1]): + image = face_images[:, i] + image = image.reshape(64, 64).T + path = os.path.join(IMG_DIR, "{}.png".format(i)) + imsave(path, image) + + logging.info("Writing lights data to {}".format(LIGHTS_FNAME)) + np.savetxt(LIGHTS_FNAME, faces["lights"].T, fmt="%f") + + logging.info("Writing poses data to {}".format(POSES_FNAME)) + np.savetxt(POSES_FNAME, faces["poses"].T, fmt="%f") + + logging.info("Writing PCA-whitened data to {}".format(PCA_FNAME)) + X = faces["images"].T + X = PCA(n_components=256, whiten=True).fit_transform(X) + np.savetxt(PCA_FNAME, X, fmt="%f") diff --git a/datasets/faces/source b/datasets/faces/source new file mode 100644 index 0000000..e89da9b --- /dev/null +++ b/datasets/faces/source @@ -0,0 +1 @@ +http://isomap.stanford.edu/datasets.html diff --git a/datasets/mnist/mnist_extract.py b/datasets/mnist/mnist_extract.py new file mode 100644 index 0000000..403b250 --- /dev/null +++ b/datasets/mnist/mnist_extract.py @@ -0,0 +1,148 @@ +from array import array as pyarray +from scipy.io import loadmat +from sklearn.decomposition import PCA + +import gzip +import hashlib +import logging +import numpy as np +import os +import os.path +import struct +import sys +import wget + + +TRAIN_IMAGES_URL = "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz" +TRAIN_LABELS_URL = "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz" +TEST_IMAGES_URL = "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz" +TEST_LABELS_URL = "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz" + +TRAIN_IMAGES_SHA256 = "440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609" +TRAIN_LABELS_SHA256 = "3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c" +TEST_IMAGES_SHA256 = "8d422c7b0a1c1c79245a5bcf07fe86e33eeafee792b84584aec276f5a2dbc4e6" +TEST_LABELS_SHA256 = "f7ae60f92e00ec6debd23a6088c31dbd2371eca3ffa0defaefb259924204aec6" + +TRAIN_SAMPLE_INDICES_FNAME = "mnist_train_sample.tbl" +TEST_SAMPLE_INDICES_FNAME = "mnist_test_sample.tbl" + +FNAME_IMG = { + 'train': 'train-images-idx3-ubyte.gz', + 'test': 't10k-images-idx3-ubyte.gz' +} + +FNAME_LBL = { + 'train': 'train-labels-idx1-ubyte.gz', + 'test': 't10k-labels-idx1-ubyte.gz' +} + + +def download_and_check(in_url, out_fname, sha256sum): + logging.info("Downloading '{}'".format(in_url)) + wget.download(in_url, out_fname) + + valid = False + with open(out_fname, "rb") as f: + valid = (hashlib.sha256(f.read()).hexdigest() == sha256sum) + + return valid + + +def load_mnist(data="train", digits=np.arange(10)): + fname_img = FNAME_IMG[data] + fname_lbl = FNAME_LBL[data] + + with gzip.open(fname_lbl, 'rb') as flbl: + magic_nr, size = struct.unpack(">II", flbl.read(8)) + lbl = pyarray("b", flbl.read()) + + with gzip.open(fname_img, 'rb') as fimg: + magic_nr, size, rows, cols = struct.unpack(">IIII", fimg.read(16)) + img = pyarray("B", fimg.read()) + + ind = [k for k in range(size) if lbl[k] in digits] + N = len(ind) + + images = np.zeros((N, rows*cols), dtype=np.uint8) + labels = np.zeros((N, 1), dtype=np.int8) + for i in range(len(ind)): + m = ind[i]*rows*cols + n = (ind[i]+1)*rows*cols + images[i] = np.array(img[m:n]) + labels[i] = lbl[ind[i]] + + return images, labels + + +if __name__ == "__main__": + logging.basicConfig(filename="mnist_extract.log", + format="%(levelname)s:%(message)s", + level=logging.INFO) + + # Get and check original data if needed + urls = [TRAIN_IMAGES_URL, TRAIN_LABELS_URL, + TEST_IMAGES_URL, TEST_LABELS_URL] + fnames = [FNAME_IMG['train'], FNAME_LBL['train'], + FNAME_IMG['test'], FNAME_LBL['test']] + sha256sums = [TRAIN_IMAGES_SHA256, TRAIN_LABELS_SHA256, + TEST_IMAGES_SHA256, TEST_LABELS_SHA256] + for url, fname, sha256sum in zip(urls, fnames, sha256sums): + if not os.path.exists(fname): + ok = download_and_check(url, fname, sha256sum) + if not ok: + logging.error("'{}' is corrupted; aborting".format(fname)) + exit(1) + + # We now have the original data + logging.info("Loading MNIST training data") + mnist_train = dict() + mnist_train['train_X'], mnist_train['train_labels'] = load_mnist("train") + train_size = mnist_train['train_X'].shape[0] + + logging.info("Loading MNIST test data") + mnist_test = dict() + mnist_test['test_X'], mnist_test['test_labels'] = load_mnist("test") + test_size = mnist_test['test_X'].shape[0] + + should_load_samples = False + if len(sys.argv) == 2 \ + or (not os.path.exists(TRAIN_SAMPLE_INDICES_FNAME)) \ + or (not os.path.exists(TEST_SAMPLE_INDICES_FNAME)): + sample_size = int(sys.argv[1]) + + if sample_size/2 > min(train_size, test_size): + print("sample size is too large") + should_load_samples = True + else: + logging.info("Generating {} samples".format(sample_size)) + train_sample_indices = np.randint(0, train_size, sample_size / 2) + test_sample_indices = np.randint(0, test_size, sample_size / 2) + + logging.info("Saving generated samples") + np.savetxt("mnist_train_sample.tbl", train_sample_indices, fmt="%u") + np.savetxt("mnist_test_sample.tbl", test_sample_indices, fmt="%u") + else: + should_load_samples = True + + if should_load_samples: + logging.info("Loading samples") + train_sample_indices = np.loadtxt(TRAIN_SAMPLE_INDICES_FNAME, dtype=int) + test_sample_indices = np.loadtxt(TEST_SAMPLE_INDICES_FNAME, dtype=int) + sample_size = train_sample_indices.shape[0] \ + + test_sample_indices.shape[0] + + logging.info("Extracting {} samples".format(sample_size)) + train_samples = mnist_train['train_X'][train_sample_indices, :] + test_samples = mnist_test['test_X'][test_sample_indices, :] + mnist_sample = np.concatenate((train_samples, test_samples)) + mnist_sample = PCA(n_components=512, whiten=True).fit_transform(mnist_sample) + + train_labels = mnist_train['train_labels'][train_sample_indices] + test_labels = mnist_test['test_labels'][test_sample_indices] + mnist_sample_labels = np.concatenate((train_labels, test_labels)) + + logging.info("Saving extracted samples and their labels") + sample_fname = "mnist_{}.tbl".format(sample_size) + labels_fname = "mnist_{}.labels".format(sample_size) + np.savetxt(sample_fname, mnist_sample, fmt="%f") + np.savetxt(labels_fname, mnist_sample_labels, fmt="%u") diff --git a/datasets/mnist/mnist_test_sample.tbl b/datasets/mnist/mnist_test_sample.tbl new file mode 100644 index 0000000..01dbdb4 --- /dev/null +++ b/datasets/mnist/mnist_test_sample.tbl @@ -0,0 +1,1000 @@ +3973 +5619 +2531 +173 +2631 +2965 +3560 +4015 +8898 +4444 +1512 +6548 +2198 +6312 +1296 +7560 +5442 +1117 +7187 +3760 +6127 +8284 +5753 +1355 +4525 +7358 +9941 +2250 +3971 +3997 +8080 +1439 +9378 +9748 +3982 +9763 +2350 +1694 +9278 +2478 +4600 +2931 +1877 +2024 +1665 +4573 +9992 +4573 +2805 +3637 +4888 +393 +6974 +342 +1428 +7768 +8996 +6218 +8090 +3170 +9294 +3762 +3572 +6668 +4507 +8653 +7773 +7401 +1864 +8081 +8926 +1423 +2438 +2299 +7662 +7016 +8325 +4795 +2827 +1549 +6415 +1814 +1587 +8541 +3768 +6314 +520 +3211 +3042 +7988 +2437 +8701 +7152 +6116 +6788 +1716 +2574 +222 +2507 +7909 +9310 +2515 +9552 +5978 +2747 +8850 +5236 +7072 +9609 +1773 +1996 +7011 +6411 +6782 +7557 +8347 +8594 +1019 +3593 +7665 +1496 +9937 +8440 +2913 +3808 +6796 +4860 +8834 +8010 +1539 +9196 +8876 +9689 +5137 +1135 +2322 +8701 +7240 +5049 +7570 +6400 +3681 +1554 +9645 +8644 +899 +6195 +4824 +8461 +3274 +7104 +9782 +1217 +4177 +3767 +3524 +6676 +5271 +4171 +1631 +1146 +218 +9407 +3410 +6100 +3513 +4910 +2599 +5342 +9572 +2431 +4692 +7466 +4627 +4954 +614 +1599 +4993 +9468 +1112 +5396 +571 +6039 +7624 +2345 +3136 +7770 +968 +7956 +2759 +1176 +4412 +4011 +8072 +6664 +3271 +2622 +1225 +3539 +3123 +4995 +4203 +8476 +5421 +4099 +7507 +4915 +5903 +2902 +3787 +9845 +626 +5494 +7632 +3265 +2735 +4421 +7779 +1584 +9824 +6218 +6274 +1646 +7492 +9262 +3954 +7870 +5170 +4843 +1354 +6864 +4077 +9041 +7851 +554 +1383 +5010 +83 +2836 +8388 +3615 +6271 +3648 +5977 +6805 +1980 +5002 +3512 +2502 +7348 +5785 +3173 +7501 +5926 +4718 +7266 +6330 +837 +5880 +4539 +1666 +2744 +6435 +8204 +2600 +9381 +7991 +2922 +8758 +3510 +2159 +5589 +9240 +9164 +3487 +82 +8549 +8142 +5323 +2833 +5876 +8804 +5384 +6427 +9445 +8258 +4534 +1401 +4012 +1963 +8864 +1296 +1971 +6865 +5059 +9953 +9929 +3362 +3947 +2774 +7999 +7872 +8911 +8231 +7525 +7673 +9513 +4506 +6972 +4863 +9817 +7406 +2814 +5024 +1235 +6361 +9599 +7879 +3188 +3635 +3510 +3144 +619 +7330 +9709 +9059 +6732 +1336 +6753 +5323 +3225 +7434 +1248 +3561 +9789 +584 +3547 +5062 +684 +683 +9749 +1890 +3295 +3922 +2478 +9084 +2081 +5425 +4698 +2516 +9604 +966 +5215 +3516 +4565 +4994 +9493 +4670 +870 +4205 +2392 +3959 +5435 +8656 +4656 +4864 +5566 +36 +4241 +952 +8403 +8439 +927 +2054 +6004 +9119 +7485 +7909 +4296 +4955 +2010 +1665 +1517 +284 +6760 +6873 +7480 +4833 +9706 +6572 +4516 +7929 +3593 +599 +9033 +7960 +1294 +8328 +2060 +3968 +9377 +6773 +4262 +8589 +6944 +5036 +1168 +3302 +9869 +3937 +6549 +9227 +490 +1144 +8171 +168 +3615 +177 +6932 +1850 +2162 +1383 +9091 +6706 +2185 +378 +8364 +5087 +4923 +8251 +4509 +6684 +5531 +7363 +9608 +2523 +6755 +9274 +3125 +7483 +1757 +5886 +5546 +9232 +7339 +1532 +6687 +2077 +151 +9142 +8634 +4382 +5496 +6438 +190 +8095 +1044 +1185 +5477 +2091 +9776 +9808 +7624 +8804 +5577 +4743 +9430 +9559 +8800 +5013 +6185 +5485 +3654 +6515 +2616 +6941 +8565 +774 +123 +1785 +1295 +5907 +6938 +3834 +8451 +3349 +9035 +8060 +5346 +130 +9765 +2271 +1931 +6623 +5083 +1360 +9246 +418 +9861 +7503 +2337 +9378 +8950 +1058 +9388 +6561 +9052 +2369 +9216 +4447 +3863 +4109 +9174 +4578 +7114 +7682 +1292 +9262 +4037 +9389 +977 +5536 +6542 +4658 +6248 +5812 +5368 +1902 +2241 +3998 +8438 +2537 +7300 +324 +6035 +705 +6183 +6218 +7344 +9178 +5217 +9838 +6091 +7340 +2628 +4193 +3754 +953 +7502 +3565 +4096 +4102 +5243 +2539 +6210 +4066 +9885 +4510 +1014 +6225 +4623 +4070 +9637 +7889 +2956 +7615 +6248 +4692 +9723 +7971 +6075 +9411 +7988 +9494 +2855 +1128 +4452 +2287 +6406 +9620 +3490 +6300 +4194 +976 +4855 +5826 +2230 +8451 +4355 +1056 +5916 +6232 +6178 +7111 +8189 +8229 +4582 +4987 +2504 +5209 +9065 +542 +9704 +5142 +8361 +650 +7443 +9784 +827 +3290 +5941 +7560 +1708 +7139 +9780 +4061 +2787 +1098 +2374 +4307 +474 +2027 +5593 +5711 +7421 +877 +4290 +449 +6847 +5852 +5325 +4446 +265 +2754 +1522 +251 +8177 +8549 +2520 +4924 +6980 +5069 +8282 +3407 +4092 +31 +8512 +2922 +718 +11 +8929 +8633 +543 +3798 +4298 +1608 +5033 +7498 +7880 +8875 +9322 +4915 +4635 +211 +8764 +4584 +3430 +4709 +3673 +4644 +5422 +1496 +1302 +7828 +9634 +9829 +8080 +5497 +7544 +4022 +7035 +9211 +8835 +3730 +3572 +7233 +9559 +8775 +275 +7197 +3020 +4200 +2802 +2257 +3467 +1800 +3972 +416 +5186 +6074 +7911 +6703 +6512 +9216 +771 +4106 +994 +4621 +5264 +4301 +4850 +7139 +2845 +3084 +9974 +5940 +5407 +1302 +6306 +4011 +4588 +4830 +8517 +5363 +1698 +5259 +4667 +4592 +1327 +4744 +5258 +8023 +1070 +8483 +9563 +2784 +7275 +9539 +633 +1612 +2249 +584 +1400 +1746 +4130 +6278 +4132 +1817 +5721 +6304 +7754 +6763 +4966 +4103 +350 +1445 +5864 +5294 +498 +6512 +6985 +1344 +6528 +9555 +6924 +3436 +2995 +4486 +7545 +2166 +1347 +5680 +8737 +2049 +5923 +6491 +8691 +7240 +4321 +9902 +1352 +4586 +6870 +9504 +4282 +814 +5203 +9037 +6233 +4286 +1500 +2867 +411 +2915 +804 +8028 +775 +6782 +9628 +6657 +5371 +8703 +9310 +9727 +3054 +3224 +4925 +6034 +8457 +255 +7549 +8916 +9338 +4516 +6693 +2317 +3229 +5808 +7849 +5907 +6960 +9521 +4025 +6247 +1126 +8732 +2250 +6327 +339 +8920 +532 +6762 +6966 +202 +2118 +4469 +1402 +2608 +3524 +8060 +6309 +2867 +2961 +6588 +2806 +4728 +7695 +3697 +7446 +1861 +3047 +6188 +1553 +7946 +5868 +4599 +1290 +3645 +4893 +7128 +544 +5489 +2054 +5743 +1459 +4830 +6090 +3792 +7530 +1310 +4238 +8773 +2622 +2441 +4483 +4987 +4 +1785 +3994 +8562 +9479 +8214 +3025 +5599 +5541 +3878 +1022 +516 +2749 +947 +9004 +945 +999 +1732 +6760 +3753 +408 +8403 +8616 +8808 +2863 +6291 +2927 +5566 +2551 +6242 +973 +3016 +9882 +4616 +5908 +9717 +494 +6446 +3305 +8158 +957 +8490 +8972 +1272 +700 +6195 +6060 +6416 +5052 +909 +3195 +1410 +8089 +4350 +1384 +568 +2457 +9287 +4669 +5095 +3792 +587 +3252 +2553 +7829 +4747 +387 +5560 +2238 +7800 +7076 +9380 +1121 +7393 +876 +8840 +99 +5336 +4206 +1363 +3166 +4376 +2 +5769 +6015 +3537 +8225 +0 +2105 +3266 +5232 +6300 +1668 +7178 +2302 +9710 +4617 +6516 +4722 +9427 +9090 +3217 +4941 +1897 +7119 +467 +5375 +3561 +418 +4564 +1775 +5244 +1708 +3234 +2402 +7947 +9803 +7822 +151 +8200 +510 +2616 +8291 +5053 diff --git a/datasets/mnist/mnist_train_sample.tbl b/datasets/mnist/mnist_train_sample.tbl new file mode 100644 index 0000000..0b136db --- /dev/null +++ b/datasets/mnist/mnist_train_sample.tbl @@ -0,0 +1,1000 @@ +57616 +17592 +35740 +20022 +18918 +6515 +47576 +21464 +314 +49504 +44255 +274 +12270 +18426 +47098 +15671 +21746 +15811 +55537 +55389 +13571 +34863 +28102 +37300 +23374 +9773 +45834 +43443 +13699 +32778 +48444 +27905 +39405 +31994 +35449 +2653 +39538 +31644 +55621 +49589 +33400 +48087 +37060 +5615 +24531 +29602 +13300 +40262 +19419 +22407 +59794 +59057 +22781 +58471 +44220 +7522 +51504 +20731 +40111 +23730 +18806 +2268 +48366 +59118 +10441 +52308 +3980 +20045 +43691 +7395 +46980 +31485 +11893 +29599 +59330 +15598 +18435 +30758 +2064 +57422 +7182 +19072 +23953 +38725 +41781 +38145 +16674 +31199 +34283 +36525 +14807 +22142 +38126 +2029 +26729 +12707 +20955 +8202 +41186 +55210 +46653 +53225 +13980 +36439 +38367 +19888 +40442 +22213 +10670 +15121 +33148 +38950 +20949 +47570 +7746 +34761 +34571 +57804 +2342 +44979 +45321 +48538 +31397 +58603 +40334 +11865 +4047 +38967 +41588 +12377 +17679 +59706 +43049 +1111 +47047 +55831 +5013 +20843 +25249 +43546 +39075 +7040 +40238 +55874 +40866 +17630 +40943 +45366 +32838 +7111 +31456 +20075 +57155 +51424 +45547 +55680 +20006 +49687 +51880 +2478 +59925 +8823 +35400 +6391 +6317 +32393 +18334 +33229 +52966 +610 +54534 +4053 +21797 +22954 +57184 +221 +782 +32494 +41319 +43098 +19062 +6984 +6527 +7383 +18878 +50131 +7372 +7476 +13935 +38478 +17870 +23977 +49675 +29469 +40345 +46503 +52574 +54370 +3183 +43099 +32347 +28495 +59102 +30434 +55341 +19146 +39833 +15710 +4594 +8962 +52262 +25213 +13208 +31540 +14932 +10167 +30292 +5674 +53250 +41076 +6776 +27278 +40701 +13723 +25387 +20423 +21282 +56499 +41356 +53193 +17825 +8226 +5938 +10545 +35398 +42341 +5002 +48616 +118 +1854 +19457 +47245 +28034 +26256 +19308 +11119 +40804 +15161 +27192 +57256 +56795 +13757 +21523 +1990 +29060 +16911 +45716 +30104 +47456 +56140 +7129 +44887 +8425 +31547 +45015 +19132 +26316 +21805 +42922 +59318 +56646 +2559 +56001 +27174 +45604 +43889 +7978 +2153 +16063 +3893 +15841 +19827 +50977 +37370 +46923 +59844 +38208 +46018 +41661 +6787 +25298 +31325 +44838 +50220 +29722 +49477 +40492 +27564 +52048 +50522 +39573 +20750 +8455 +22065 +55737 +10024 +44971 +58043 +27397 +47389 +19956 +49412 +9202 +25218 +29632 +19089 +15577 +34808 +50562 +46310 +23015 +20072 +48681 +34695 +56340 +15994 +16476 +22842 +44730 +16490 +45098 +1075 +36609 +17507 +17455 +49243 +14003 +26233 +17753 +801 +37771 +14320 +9597 +8226 +34532 +29634 +32143 +57209 +14431 +3626 +54846 +38836 +27628 +58887 +46146 +7617 +40997 +51786 +48037 +7515 +46117 +737 +6944 +9637 +24650 +19940 +11521 +54289 +10438 +40171 +35118 +52702 +25979 +40534 +45914 +6720 +31211 +37663 +4577 +39590 +51347 +58143 +33514 +10615 +25444 +5506 +15033 +46269 +26080 +48588 +16404 +13579 +40096 +13467 +49872 +59896 +55475 +32428 +12781 +56377 +53077 +56730 +5342 +13619 +39237 +10853 +13016 +20639 +14830 +7664 +41604 +25751 +300 +35102 +19849 +15766 +12979 +31771 +32602 +31761 +43788 +57370 +35274 +18208 +13870 +15842 +2671 +5217 +25586 +57012 +20552 +12579 +26353 +45120 +43146 +15230 +53589 +50219 +50311 +6967 +6167 +7353 +3965 +10934 +9052 +25427 +33921 +42194 +12527 +17364 +10177 +33662 +20745 +32536 +41904 +19963 +22195 +22549 +6632 +9889 +56236 +53022 +35725 +42694 +16032 +38542 +24758 +1802 +40481 +28033 +16730 +27517 +42091 +4849 +18084 +59100 +3990 +27005 +17890 +18074 +19225 +27349 +42790 +5577 +3000 +45881 +18213 +55348 +47812 +33784 +27582 +30 +39035 +24942 +5179 +22934 +7829 +22250 +50897 +52136 +4512 +54522 +33580 +57999 +6 +17499 +44035 +33174 +46357 +6980 +48263 +25670 +51411 +47402 +12477 +28570 +29781 +1105 +46944 +52178 +27981 +1945 +3431 +11008 +1925 +55090 +17868 +24114 +30978 +6540 +11376 +14505 +44289 +47336 +32765 +58824 +29040 +25884 +46350 +21710 +56361 +28692 +27269 +36220 +49106 +13907 +37738 +12228 +38044 +12753 +20518 +46432 +29569 +14361 +42860 +46855 +4726 +29445 +52094 +21769 +42226 +24806 +45423 +55215 +39094 +46734 +17129 +3640 +36628 +56080 +48031 +49386 +3173 +9713 +1636 +45135 +23810 +8566 +3271 +34111 +3968 +34837 +14182 +56039 +29414 +2625 +26194 +1865 +9577 +9867 +9636 +21600 +28828 +53985 +31700 +43341 +15693 +51745 +11953 +5350 +24821 +13598 +57255 +25230 +37624 +12629 +41842 +8366 +57945 +39079 +44896 +59269 +30424 +38785 +19199 +33009 +8067 +42738 +52697 +34452 +23321 +14719 +48976 +29272 +23271 +56086 +20336 +52107 +11830 +50826 +37389 +46229 +21872 +9314 +23795 +9193 +52074 +21327 +47624 +11887 +49012 +54328 +20828 +3109 +49819 +25723 +34310 +59136 +19174 +16688 +21371 +29697 +38450 +42881 +25583 +20355 +3672 +22386 +53670 +40080 +56620 +58651 +5132 +45963 +26799 +37046 +55683 +34252 +57630 +6523 +25146 +12865 +51186 +39016 +27828 +56666 +18486 +58665 +56209 +53839 +41163 +3409 +4117 +37738 +27035 +21013 +10042 +6348 +41410 +5989 +17595 +28447 +53010 +3887 +20584 +210 +51431 +58339 +59181 +28704 +54516 +32421 +17650 +34597 +40747 +8175 +47896 +46883 +13524 +37808 +53754 +434 +37154 +4800 +26379 +6292 +59644 +3665 +24044 +54544 +7689 +18424 +47761 +31404 +13379 +34133 +3885 +32757 +5936 +52907 +28721 +29113 +20960 +21089 +23480 +55886 +15426 +59779 +23741 +41373 +58737 +18447 +11813 +37271 +34253 +43787 +20375 +9676 +21516 +6112 +44909 +45557 +18346 +11669 +7183 +48922 +27542 +40695 +16962 +49353 +32351 +16987 +12825 +1687 +45980 +47744 +15279 +53463 +34186 +18334 +51386 +46662 +17011 +33724 +31658 +5435 +36696 +43562 +30754 +57056 +44728 +22622 +23268 +19760 +44223 +19033 +58856 +24768 +45390 +6111 +27347 +19049 +58364 +53201 +26508 +28685 +49889 +32145 +24442 +50683 +29089 +58209 +11491 +31186 +44327 +28716 +2250 +35204 +9207 +34721 +8905 +30420 +50198 +6715 +25686 +37755 +33949 +48826 +45839 +53569 +37224 +48830 +10076 +51664 +36094 +24915 +19176 +52754 +43065 +43369 +56068 +17771 +21623 +37085 +1651 +33232 +51024 +38208 +31317 +38383 +13661 +35438 +46183 +37855 +37947 +14860 +3413 +55236 +19724 +7523 +1800 +18376 +52348 +19879 +53325 +50800 +53410 +57100 +15450 +51173 +19816 +15163 +36804 +33383 +36498 +29529 +14041 +23527 +14687 +17229 +26800 +26413 +22147 +9450 +52830 +24639 +49517 +48152 +50283 +52594 +15371 +40367 +43886 +45651 +44733 +17024 +44066 +55340 +11711 +59444 +57575 +50352 +37853 +15395 +50645 +4514 +24351 +55141 +32242 +18352 +30645 +53462 +32026 +15911 +47859 +2325 +2040 +20152 +16447 +7861 +32277 +45087 +29190 +50412 +26261 +13543 +32734 +5702 +24248 +12978 +10042 +57447 +499 +32466 +18431 +58556 +43702 +63 +9529 +15490 +23804 +1759 +52811 +16673 +18573 +5664 +28446 +26391 +45384 +48015 +57691 +4717 +46134 +27084 +39128 +28829 +24662 +22151 +3268 +55806 +24820 +51857 +19078 +1096 +10933 +12914 +8356 +44247 +33466 +27531 +1580 +51242 +5040 +15635 +4654 +55430 +37081 +26232 +18655 +32103 +42792 +57232 +57450 +57955 +25265 +20074 +56113 +22123 +26840 +10551 +58099 +13944 +37154 +437 +19498 +55523 +31750 +19266 +27300 +2007 +22425 +4642 +1862 +45703 +18243 +45478 +16779 +57440 +19256 +9985 diff --git a/datasets/mnist/source b/datasets/mnist/source new file mode 100644 index 0000000..19ebae4 --- /dev/null +++ b/datasets/mnist/source @@ -0,0 +1 @@ +http://yann.lecun.com/exdb/mnist/ diff --git a/datasets/newsgroups/README b/datasets/newsgroups/README new file mode 100644 index 0000000..78e92ee --- /dev/null +++ b/datasets/newsgroups/README @@ -0,0 +1,7 @@ +For running the script newsgroups_extract.py we used N = 500 and topics are +'comp.graphics', 'misc.forsale', and 'sci.med'. + +The randomly generated ids in our case are in the file 'newsgroups-500-3.ids'. + +Before running the script, be sure to generate the stopwords file using +'stop.sh'. diff --git a/datasets/newsgroups/newsgroups-500-3.ids b/datasets/newsgroups/newsgroups-500-3.ids new file mode 100644 index 0000000..9b468bd --- /dev/null +++ b/datasets/newsgroups/newsgroups-500-3.ids @@ -0,0 +1,1493 @@ +38998 +38380 +38224 +38260 +38642 +38718 +38728 +39495 +38338 +38696 +38984 +39068 +39665 +38657 +38944 +38811 +38421 +38324 +38580 +38430 +38541 +37950 +38483 +37940 +38794 +37913 +39053 +38412 +38916 +39677 +39644 +38502 +38960 +38594 +39022 +38777 +38855 +38865 +38909 +38511 +38831 +38813 +38522 +38558 +38504 +38273 +37947 +38802 +38354 +38639 +39016 +38808 +38608 +38772 +38673 +38928 +38452 +38887 +38795 +38864 +38484 +38305 +38975 +38833 +37918 +38856 +38344 +37936 +39670 +38776 +38547 +39063 +38731 +38353 +39064 +38760 +38220 +38231 +38459 +38889 +39072 +38878 +37958 +38278 +39642 +39496 +38962 +38948 +37932 +38700 +38963 +38393 +38405 +38753 +39035 +39671 +38445 +39012 +38896 +38787 +38839 +38259 +38560 +38803 +38751 +38613 +38595 +38822 +38610 +38996 +38945 +38845 +38933 +38530 +38474 +38981 +38265 +37935 +37961 +38650 +38599 +38486 +38229 +38364 +39002 +38321 +38299 +38645 +38505 +39031 +39061 +38882 +38921 +38747 +39017 +38913 +38559 +39041 +39030 +38535 +38911 +38809 +38296 +39624 +38397 +38233 +38345 +38458 +38871 +38854 +38255 +39034 +38375 +38872 +38791 +38232 +38646 +38544 +38314 +38980 +38286 +38716 +38720 +38997 +38792 +38886 +38714 +38605 +38685 +38769 +38899 +38582 +38736 +38799 +38683 +39623 +38306 +38551 +38652 +38532 +39067 +38986 +38804 +38668 +38626 +38274 +38373 +38999 +38644 +38498 +38978 +38834 +39622 +38357 +38262 +38287 +37943 +38991 +38322 +38313 +38492 +37948 +39620 +38550 +38327 +38816 +38609 +39011 +38542 +38835 +39647 +38763 +38715 +39656 +38987 +38239 +38587 +38507 +38627 +39636 +38623 +38957 +38451 +39650 +39049 +39669 +39039 +38968 +38399 +37945 +38312 +39059 +39019 +38693 +38369 +38801 +38477 +39044 +38525 +38641 +39640 +38733 +38702 +39037 +38234 +38294 +38332 +37952 +38466 +37931 +38443 +40027 +39004 +38619 +38298 +38325 +38348 +38807 +38455 +38876 +38869 +38767 +38800 +38863 +38555 +38439 +38625 +38337 +38284 +38840 +38764 +38914 +38361 +38884 +38765 +38904 +38493 +38310 +38953 +38584 +37938 +38972 +38471 +38655 +38995 +38225 +38925 +38930 +38713 +38958 +38531 +38628 +39490 +38330 +39673 +39010 +38476 +38758 +38832 +38651 +38276 +39637 +38433 +38588 +38669 +38223 +38311 +38387 +38721 +39079 +38881 +38852 +39737 +39488 +38723 +38251 +38597 +38389 +37914 +37930 +38554 +38830 +38724 +38741 +38867 +38947 +38745 +38977 +37925 +39626 +39066 +38413 +37957 +38617 +38690 +37939 +38496 +38464 +38221 +38748 +38775 +38618 +38942 +38632 +38257 +38612 +38927 +38331 +38382 +38631 +38365 +38660 +38729 +38398 +38826 +38691 +38407 +38812 +38253 +39006 +38342 +39634 +37953 +38759 +38810 +39062 +39076 +37921 +38261 +38434 +39054 +39615 +38381 +39047 +38782 +38280 +38992 +39661 +39084 +39014 +38424 +38779 +38749 +38983 +38637 +38709 +38686 +39027 +38323 +37962 +38923 +38228 +38931 +37916 +38099 +38727 +37942 +38844 +38966 +38843 +38746 +38347 +38317 +39040 +38708 +38790 +38665 +39075 +38938 +38725 +37924 +38937 +38437 +38674 +38692 +38527 +38400 +39638 +38562 +38744 +38526 +39042 +38340 +38902 +38825 +39069 +38570 +38362 +38564 +38275 +38903 +38403 +38581 +38536 +38567 +38377 +38781 +38906 +38328 +38216 +38624 +38293 +38240 +38545 +38635 +38891 +38877 +38463 +38658 +38994 +39020 +38890 +38850 +38654 +39668 +38814 +38515 +39023 +38292 +39643 +38666 +38352 +37963 +38719 +38319 +38411 +39676 +37922 +38303 +38436 +38244 +38820 +38621 +38680 +38773 +38687 +38450 +38252 +37261 +38269 +38444 +38989 +39655 +38579 +38552 +38670 +38320 +38533 +38620 +39001 +38846 +38734 +38241 +38238 +39080 +38429 +38499 +38982 +38277 +39057 +59193 +58141 +59454 +59029 +59020 +59187 +59453 +59121 +59531 +59581 +59514 +59533 +59622 +58050 +59015 +58114 +58799 +59297 +58092 +58065 +59388 +59603 +59069 +59640 +59580 +58997 +59021 +59381 +59604 +59119 +59353 +59318 +58918 +59480 +59234 +58866 +58113 +59341 +59136 +59635 +59301 +58053 +58981 +58778 +59415 +58940 +59007 +58048 +59006 +59028 +59396 +58796 +59455 +59001 +59062 +59180 +59436 +59463 +59299 +59075 +59320 +59236 +58118 +58836 +59628 +58931 +58943 +59158 +58963 +59345 +58978 +59265 +58851 +59024 +59063 +58111 +59254 +58830 +58055 +59197 +58824 +59014 +59190 +58785 +58071 +58897 +59131 +59639 +58955 +59044 +59092 +59224 +58093 +59225 +59334 +59200 +59369 +58774 +58797 +59220 +59059 +58129 +59351 +59192 +58133 +59491 +58898 +59145 +59448 +58869 +59198 +58578 +59270 +58760 +59241 +59085 +58794 +58854 +59404 +58790 +58127 +59417 +59250 +58888 +58881 +59571 +59434 +59539 +59025 +59642 +58817 +59129 +58989 +59115 +58841 +59372 +59058 +58098 +59641 +59594 +59401 +59142 +58967 +58877 +59513 +58954 +58070 +59469 +59385 +59357 +59368 +59595 +58942 +59041 +59364 +59356 +59052 +59358 +58947 +59082 +59346 +59430 +58867 +59451 +58973 +58977 +59228 +58770 +59153 +59522 +59347 +59185 +59556 +59414 +58970 +59100 +59287 +58878 +58904 +59602 +59002 +59428 +59498 +58112 +58153 +58052 +58064 +59573 +59507 +59138 +59196 +58789 +58910 +59481 +58951 +58577 +59515 +59040 +58798 +59591 +58884 +59077 +59398 +59336 +59268 +59378 +59140 +58920 +58091 +59487 +59575 +59027 +59510 +59425 +59540 +59276 +59079 +59106 +58938 +59047 +59127 +59303 +59344 +59638 +58807 +58969 +59389 +58825 +59034 +59512 +59326 +59122 +59629 +59419 +59033 +58155 +58919 +59205 +58862 +59493 +58808 +58121 +58966 +59496 +59380 +59506 +59148 +58968 +59291 +58109 +59476 +58894 +58145 +58883 +59625 +58054 +59468 +58140 +58900 +59212 +58932 +59111 +59466 +59026 +59076 +59202 +59049 +58139 +59211 +58956 +59097 +59505 +59472 +59046 +59517 +59072 +59199 +59030 +59373 +59342 +59156 +59444 +58934 +59500 +58096 +59330 +58082 +59400 +59309 +59605 +59548 +59474 +58833 +59566 +59070 +58056 +59157 +59537 +58569 +58103 +59518 +59288 +59273 +59132 +58944 +58783 +58777 +58926 +58084 +58820 +59606 +58046 +59402 +58911 +59008 +59362 +59160 +58154 +58840 +59172 +59178 +58764 +59210 +59061 +59176 +58804 +59088 +58110 +59023 +58834 +59449 +58871 +58946 +59286 +59031 +59327 +59627 +58902 +59619 +58142 +58852 +58806 +59048 +58868 +58810 +59585 +59379 +59329 +58964 +58803 +59467 +59460 +59089 +59139 +59446 +58896 +59587 +59154 +59532 +59482 +59599 +59504 +58099 +59264 +58809 +59054 +58045 +58787 +59223 +59624 +59179 +59393 +59471 +59313 +59081 +58985 +59435 +58126 +59376 +59253 +59144 +58800 +59339 +58846 +59169 +59509 +59564 +59528 +59354 +59577 +59134 +59159 +58849 +58839 +59123 +58152 +59203 +59516 +58568 +58843 +59439 +59488 +59281 +59382 +59343 +59167 +58850 +59112 +58857 +59011 +59102 +58150 +59569 +58078 +58057 +59257 +58801 +59275 +58079 +59305 +59283 +58979 +58971 +59394 +59258 +59238 +58873 +59597 +59151 +59416 +58991 +58061 +59440 +59022 +59289 +59464 +58922 +59411 +58786 +59352 +58876 +58097 +59530 +58811 +58819 +58782 +58791 +58909 +59308 +58100 +59280 +58072 +59328 +58829 +58086 +58880 +59242 +59319 +58831 +59064 +59039 +58805 +59590 +59547 +59091 +58123 +59623 +59584 +59080 +59243 +59399 +58905 +58832 +59648 +58950 +58813 +59244 +59226 +59208 +59101 +59003 +58802 +58769 +58891 +59422 +59104 +59045 +58089 +59125 +58766 +59544 +59095 +59568 +59479 +59427 +58107 +59099 +76462 +76866 +76902 +76127 +75855 +76143 +76013 +74765 +76477 +76491 +76528 +76247 +75893 +76494 +76594 +76111 +75881 +75987 +76497 +76248 +74763 +75892 +76343 +76115 +75843 +76001 +75884 +74732 +76054 +76263 +76396 +76517 +76074 +76086 +76165 +76808 +76596 +75920 +76305 +75873 +76507 +76936 +76292 +76077 +76571 +75903 +74778 +76200 +76011 +75978 +76043 +76505 +76819 +74758 +76085 +76137 +75986 +76856 +76091 +76666 +76339 +76344 +76833 +76347 +75993 +76785 +76169 +76644 +75898 +76863 +76504 +76789 +75944 +75848 +75849 +75941 +76816 +76476 +76008 +76322 +76470 +76755 +74757 +76672 +75945 +74818 +75951 +76160 +76194 +76044 +75851 +75868 +75902 +74734 +74819 +75852 +76677 +76927 +76087 +76445 +76342 +75933 +76363 +75984 +76084 +76014 +76082 +76420 +76678 +74735 +74773 +76238 +76804 +75889 +75869 +76117 +76052 +76639 +76098 +76102 +76475 +76312 +76070 +76661 +76119 +74739 +76291 +74794 +76093 +76217 +76259 +76245 +76360 +75926 +75942 +74795 +76467 +76302 +76126 +76640 +74725 +76276 +76662 +76473 +75896 +74806 +75930 +75916 +76586 +74820 +76432 +75911 +76063 +74744 +76224 +76056 +75983 +76213 +76496 +76189 +76834 +74729 +76679 +76813 +76488 +76274 +76072 +76794 +75862 +76880 +76850 +76021 +76944 +74746 +74830 +75888 +76367 +75953 +75969 +76003 +74807 +76825 +76848 +76406 +76826 +76050 +76176 +76334 +76308 +76311 +76250 +76882 +76231 +75982 +76482 +75980 +75918 +76329 +75905 +76664 +76298 +76490 +76188 +75872 +76215 +76353 +76458 +76521 +76538 +76840 +76453 +76134 +74769 +76421 +76057 +76288 +76065 +76831 +76704 +76624 +76434 +76567 +76253 +76107 +74730 +76824 +76149 +76184 +74814 +76258 +76252 +76139 +76278 +76812 +76506 +76537 +76170 +76835 +76096 +75971 +74828 +76757 +76845 +75904 +75051 +76140 +76486 +76459 +74792 +76362 +76364 +77056 +74781 +76359 +74776 +76046 +76281 +76844 +76516 +74743 +76261 +76381 +74740 +76108 +76132 +76378 +76405 +76161 +76608 +76133 +76275 +76783 +74741 +76198 +76435 +75854 +76606 +76301 +76485 +74768 +76038 +75997 +76837 +76851 +76860 +75989 +76414 +76350 +76852 +76214 +76030 +74774 +76481 +75947 +76446 +76124 +76512 +76151 +76318 +76089 +76556 +76478 +76118 +76180 +74824 +74150 +76075 +76101 +76781 +76418 +76500 +76349 +76182 +76881 +76024 +76039 +76450 +76045 +76109 +76489 +76079 +76809 +76939 +74772 +76510 +76658 +76069 +76601 +75912 +76438 +76691 +76078 +76270 +76447 +76461 +76861 +75891 +76437 +76314 +74798 +76254 +76192 +76836 +76240 +76878 +74727 +76779 +74761 +76148 +76522 +75846 +76417 +75876 +76141 +76233 +76425 +76547 +76940 +76800 +76357 +76212 +76328 +76827 +76650 +76558 +76649 +76372 +76251 +76655 +74810 +76416 +75929 +75948 +76681 +75908 +76842 +75927 +75940 +75842 +75864 +76935 +75994 +76832 +76209 +76310 +76597 +75906 +76659 +76671 +76035 +76391 +74808 +76868 +76591 +74797 +76455 +76546 +76853 +76120 +76053 +76273 +76144 +76092 +75958 +76460 +76879 +76463 +76479 +75860 +75992 +76792 +75955 +76574 +76515 +75962 +76123 +76648 +76317 +74720 +74826 +76685 +76348 +75946 +76220 +76563 +76204 +76578 +76855 +76048 +76088 +76060 +76025 +76553 +75883 +76227 +75961 +76508 +76145 +76793 +76181 +74785 +74721 +76193 +76041 +76171 +76638 +76210 +76645 +76256 +75899 +76099 +76823 +75981 +74752 +76673 +76095 +76324 +75879 +75895 +76131 +76780 +76229 +76820 +74726 +76023 +76299 +76796 +76152 +76599 +76354 +76575 +76456 +76419 +76572 +76545 +76603 +76287 +76600 +76315 +76647 +76684 +76172 +76177 +74804 +76665 +76340 +77014 +76094 +76582 +76338 +74829 +74793 +76174 +76199 +74736 +76570 diff --git a/datasets/newsgroups/newsgroups_extract.py b/datasets/newsgroups/newsgroups_extract.py new file mode 100644 index 0000000..51c5030 --- /dev/null +++ b/datasets/newsgroups/newsgroups_extract.py @@ -0,0 +1,137 @@ +from sklearn.decomposition import PCA +from sklearn.feature_extraction.text import TfidfVectorizer + +import hashlib +import logging +import numpy as np +import os +import os.path +import sys +import tarfile +import wget + + +DATA_URL = "http://kdd.ics.uci.edu/databases/20newsgroups/20_newsgroups.tar.gz" +DATA_FILE = "20_newsgroups.tar.gz" +DATA_SHA256 = "b7bbf82b7831f7dbb1a09d9312f66fa78565c8de25526999b0d66f69d37e414" + + +def build_topic_corpus(corpus_file, n, topic): + logging.info("Extracting corpus for topic '{}'".format(topic)) + topic_items = [] + names = corpus_file.getnames() + for name in names: + if topic in name: + ti = corpus_file.getmember(name) + if ti.isfile(): + topic_items.append(name) + if len(topic_items) == 0: + # Topic does not exist (no items fetched) + raise ValueError(topic) + + topic_ids = [] + topic_corpus = [] + indices = np.arange(len(topic_items)) + np.random.shuffle(indices) + indices = indices[:n] + for i in indices: + ti = corpus_file.getmember(topic_items[i]) + with corpus_file.extractfile(ti) as f: + try: + contents = str(f.read(), encoding="utf8") + except ValueError as e: + logging.warn("Encoding error in '{}': {}".format(ti.name, e)) + continue + _, item_id = os.path.split(ti.name) + topic_ids.append(item_id) + topic_corpus.append(contents) + + return topic_ids, topic_corpus + + +def build_corpus(n, topics): + """ + Builds a corpus with each topic, with N items each. + Returns a list of document IDs and a corpus which is a dict where each topic + is a key mapped to a list of document contents. + """ + ids = [] + corpus = dict() + with tarfile.open(DATA_FILE, "r:gz") as f: + for topic in topics: + topic_ids, topic_corpus = build_topic_corpus(f, n, topic) + corpus[topic] = topic_corpus + ids.extend(topic_ids) + return ids, corpus + + +if __name__ == "__main__": + if len(sys.argv) < 4: + print("usage: {} STOP_WORDS N TOPIC [ TOPIC [ ... ] ]".format(sys.argv[0])) + print("The program reads the file STOP_WORDS for stop words, extracts" + + " and generates a BoW model from N random articles of each TOPIC") + exit(1) + + logging.basicConfig(filename="newsgroups_extract.log", + format="%(levelname)s:%(message)s", + level=logging.INFO) + + if not os.path.exists(DATA_FILE): + logging.info("Downloading data from '{}'".format(DATA_URL)) + wget.download(DATA_URL, DATA_FILE) + with open(DATA_FILE, "rb") as f: + if not hashlib.sha256(f.read()).hexdigest() != DATA_SHA256: + logging.error("'{}' is corrupted; aborting".format(DATA_FILE)) + exit(1) + + # Read stop words list + try: + with open(sys.argv[1]) as stop_words_file: + stop_words = stop_words_file.read().split() + except Exception as e: + logging.error("Could not read stop words: {}".format(e)) + exit(1) + + try: + n = int(sys.argv[2]) + if (n < 2) or (n > 1000): + raise ValueError("N must be between 2 and 1000") + except ValueError as e: + logging.error("Invalid argument: {}".format(e)) + exit(1) + + # Extract text corpus from tarball + logging.info("Building corpus") + topics = sys.argv[3:] + try: + ids, corpus = build_corpus(n, topics) + except ValueError as e: + logging.error("Invalid topic: {}".format(e)) + exit(1) + + corpus_text = [] + for topic_items in corpus.values(): + corpus_text.extend(topic_items) + + # Compute the TF-IDF matrix + logging.info("Computing TF-IDF matrix") + vectorizer = TfidfVectorizer(min_df=0.01, stop_words=stop_words) + X = vectorizer.fit_transform(corpus_text) + + # Reduce data dimensionality using PCA + logging.info("Computing PCA and reducing to 512 dimensions") + X = PCA(n_components=512, whiten=True).fit_transform(X.toarray()) + + # Save all extracted features and related data + logging.info("Writing IDs file") + ids_fname = "newsgroups-{}-{}.ids".format(n, len(topics)) + np.savetxt(ids_fname, ids, fmt="%s") + + logging.info("Writing table file") + tbl_fname = "newsgroups-{}-{}.tbl".format(n, len(topics)) + np.savetxt(tbl_fname, X.todense(), fmt="%f") + + logging.info("Writing labels file") + labels_fname = "newsgroups-{}-{}.labels".format(n, len(topics)) + counts = [len(topic_items) for topic_items in corpus.values()] + np.savetxt(labels_fname, np.repeat(topics, counts), fmt="%s") diff --git a/datasets/newsgroups/source b/datasets/newsgroups/source new file mode 100644 index 0000000..764f792 --- /dev/null +++ b/datasets/newsgroups/source @@ -0,0 +1 @@ +http://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.html diff --git a/datasets/newsgroups/stop.sh b/datasets/newsgroups/stop.sh new file mode 100644 index 0000000..36a5f74 --- /dev/null +++ b/datasets/newsgroups/stop.sh @@ -0,0 +1,12 @@ +# stop.sh +# +# Generate proper stop words list from the 'stop.txt' file. + + +# Original source: http://snowball.tartarus.org/algorithms/english/stop.txt +# NOTE: in our experiments, stop.txt has been modified to include the last stop +# words (stop.txt is included). + +sed 's/|.*//g' <stop.txt \ + | sed 's/ \+//g' \ + | sed '/^$/d' >words.txt diff --git a/datasets/newsgroups/stop.txt b/datasets/newsgroups/stop.txt new file mode 100644 index 0000000..5d0a34b --- /dev/null +++ b/datasets/newsgroups/stop.txt @@ -0,0 +1,310 @@ + + | An English stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | Many of the forms below are quite rare (e.g. "yourselves") but included for + | completeness. + + | PRONOUNS FORMS + | 1st person sing + +i | subject, always in upper case of course + +me | object +my | possessive adjective + | the possessive pronoun `mine' is best suppressed, because of the + | sense of coal-mine etc. +myself | reflexive + | 1st person plural +we | subject + +| us | object + | care is required here because US = United States. It is usually + | safe to remove it if it is in lower case. +our | possessive adjective +ours | possessive pronoun +ourselves | reflexive + | second person (archaic `thou' forms not included) +you | subject and object +your | possessive adjective +yours | possessive pronoun +yourself | reflexive (singular) +yourselves | reflexive (plural) + | third person singular +he | subject +him | object +his | possessive adjective and pronoun +himself | reflexive + +she | subject +her | object and possessive adjective +hers | possessive pronoun +herself | reflexive + +it | subject and object +its | possessive adjective +itself | reflexive + | third person plural +they | subject +them | object +their | possessive adjective +theirs | possessive pronoun +themselves | reflexive + | other forms (demonstratives, interrogatives) +what +which +who +whom +this +that +these +those + + | VERB FORMS (using F.R. Palmer's nomenclature) + | BE +am | 1st person, present +is | -s form (3rd person, present) +are | present +was | 1st person, past +were | past +be | infinitive +been | past participle +being | -ing form + | HAVE +have | simple +has | -s form +had | past +having | -ing form + | DO +do | simple +does | -s form +did | past +doing | -ing form + + | The forms below are, I believe, best omitted, because of the significant + | homonym forms: + + | He made a WILL + | old tin CAN + | merry month of MAY + | a smell of MUST + | fight the good fight with all thy MIGHT + + | would, could, should, ought might however be included + + | | AUXILIARIES + | | WILL + |will + +would + + | | SHALL + |shall + +should + + | | CAN + |can + +could + + | | MAY + |may + |might + | | MUST + |must + | | OUGHT + +ought + + | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing + | pronoun + verb + +i'm +you're +he's +she's +it's +we're +they're +i've +you've +we've +they've +i'd +you'd +he'd +she'd +we'd +they'd +i'll +you'll +he'll +she'll +we'll +they'll + + | verb + negation + +isn't +aren't +wasn't +weren't +hasn't +haven't +hadn't +doesn't +don't +didn't + + | auxiliary + negation + +won't +wouldn't +shan't +shouldn't +can't +cannot +couldn't +mustn't + + | miscellaneous forms + +let's +that's +who's +what's +here's +there's +when's +where's +why's +how's + + | rarer forms + + | daren't needn't + + | doubtful forms + + | oughtn't mightn't + + | ARTICLES +a +an +the + + | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so + | high, that classification is pointless.) +and +but +if +or +because +as +until +while + +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under + +again +further +then +once + +here +there +when +where +why +how + +all +any +both +each +few +more +most +other +some +such + +no +nor +not +only +own +same +so +than +too +very + +one +every +least +less +many +now +ever +never +say +says +said +also +get +go +goes +just +made +make +put +see +seen +whether +like +well +back +even +still +way +take +since +another +however +two +three +four +five +first +second +new +old +high +long + diff --git a/datasets/newsgroups/words.txt b/datasets/newsgroups/words.txt new file mode 100644 index 0000000..0d11300 --- /dev/null +++ b/datasets/newsgroups/words.txt @@ -0,0 +1,216 @@ +i +me +my +myself +we +our +ours +ourselves +you +your +yours +yourself +yourselves +he +him +his +himself +she +her +hers +herself +it +its +itself +they +them +their +theirs +themselves +what +which +who +whom +this +that +these +those +am +is +are +was +were +be +been +being +have +has +had +having +do +does +did +doing +would +should +could +ought +i'm +you're +he's +she's +it's +we're +they're +i've +you've +we've +they've +i'd +you'd +he'd +she'd +we'd +they'd +i'll +you'll +he'll +she'll +we'll +they'll +isn't +aren't +wasn't +weren't +hasn't +haven't +hadn't +doesn't +don't +didn't +won't +wouldn't +shan't +shouldn't +can't +cannot +couldn't +mustn't +let's +that's +who's +what's +here's +there's +when's +where's +why's +how's +a +an +the +and +but +if +or +because +as +until +while +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under +again +further +then +once +here +there +when +where +why +how +all +any +both +each +few +more +most +other +some +such +no +nor +not +only +own +same +so +than +too +very +one +every +least +less +many +now +ever +never +say +says +said +also +get +go +goes +just +made +make +put +see +seen +whether +like +well +back +even +still +way +take +since +another +however +two +three +four +five +first +second +new +old +high +long diff --git a/datasets/segmentation/segmentation_extract.py b/datasets/segmentation/segmentation_extract.py new file mode 100644 index 0000000..e621161 --- /dev/null +++ b/datasets/segmentation/segmentation_extract.py @@ -0,0 +1,39 @@ +import hashlib +import logging +import pandas as pd +import os +import os.path +import wget + + +DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/image/segmentation.test" +DATA_SHA256 = "2e9e966479d54c6aaec309059376dd9c89c1b46bf3a23aceeefb36d20d93a189" +DATA_FILE = "segmentation.test" + + +if __name__ == "__main__": + logging.basicConfig(filename="segmentation_extract.log", + format="%(levelname)s:%(message)s", + level=logging.INFO) + + if not os.path.exists(DATA_FILE): + logging.info("Downloading '{}'".format(DATA_URL)) + wget.download(DATA_URL, DATA_FILE) + with open(DATA_FILE, "rb") as f: + if hashlib.sha256(f.read()).hexdigest() != DATA_SHA256: + logging.error("{} is corrupted; aborting".format(DATA_FILE)) + + + df = pd.read_table(DATA_FILE, header=None, skiprows=4, delimiter=",") + + # First column contains class names, which we convert to numbers using the + # 'class_labels' dict + classes = set(df[0]) + numbers = [i for i in range(len(classes))] + class_labels = dict(zip(classes, numbers)) + + data = df.drop([0, 3], axis=1) + data.to_csv("segmentation.tbl", sep=" ", index=False, header=False) + + labels = df[0].apply(lambda x: class_labels[x]) + labels.to_csv("segmentation.labels", sep=" ", index=False, header=False) diff --git a/datasets/segmentation/source b/datasets/segmentation/source new file mode 100644 index 0000000..ab98436 --- /dev/null +++ b/datasets/segmentation/source @@ -0,0 +1 @@ +https://archive.ics.uci.edu/ml/datasets/Image+Segmentation diff --git a/datasets/wdbc/source b/datasets/wdbc/source new file mode 100644 index 0000000..67d201a --- /dev/null +++ b/datasets/wdbc/source @@ -0,0 +1 @@ +http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/ diff --git a/datasets/wdbc/wdbc_extract.py b/datasets/wdbc/wdbc_extract.py new file mode 100644 index 0000000..9b6b84a --- /dev/null +++ b/datasets/wdbc/wdbc_extract.py @@ -0,0 +1,34 @@ +import hashlib +import logging +import pandas as pd +import os +import os.path +import wget + + +DATA_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data" +DATA_SHA256 = "d606af411f3e5be8a317a5a8b652b425aaf0ff38ca683d5327ffff94c3695f4a" +DATA_FILE = "wdbc.data" + + +if __name__ == "__main__": + logging.basicConfig(filename="wdbc_extract.log", + format="%(levelname)s:%(message)s", + level=logging.INFO) + + if not os.path.exists(DATA_FILE): + logging.info("Downloading '{}".format(DATA_URL)) + wget.download(DATA_URL, DATA_FILE) + with open(DATA_FILE, "rb") as f: + if hashlib.sha256(f.read()).hexdigest() != DATA_SHA256: + logging.error("'{}' is corrupted; aborting".format(DATA_FILE)) + exit(1) + + data = pd.read_table(DATA_FILE, header=None, delimiter=",") + wdbc_ids = data[0] + wdbc_labels = data[1] + wdbc = data.drop([0, 1], axis=1) + + wdbc.to_csv("wdbc.tbl", sep=" ", index=False, header=False) + wdbc_labels.to_csv("wdbc.labels", sep=" ", index=False, header=False) + wdbc_ids.to_csv("wdbc.ids", sep=" ", index=False, header=False) |