aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSamuel Fadel <samuelfadel@gmail.com>2016-08-19 14:20:57 -0300
committerSamuel Fadel <samuelfadel@gmail.com>2016-08-19 14:20:57 -0300
commitb255338295587246292dc978e7d4d5687ee01fb4 (patch)
tree1581b76a03f4929c5132dcb3c6920fa761f8261c
parentfbf8d82cdd3720c4bbf2a94035b6779e56d73448 (diff)
Scripts and other files for building all datasets.
-rw-r--r--datasets/faces/faces_extract.py81
-rw-r--r--datasets/faces/source1
-rw-r--r--datasets/mnist/mnist_extract.py148
-rw-r--r--datasets/mnist/mnist_test_sample.tbl1000
-rw-r--r--datasets/mnist/mnist_train_sample.tbl1000
-rw-r--r--datasets/mnist/source1
-rw-r--r--datasets/newsgroups/README7
-rw-r--r--datasets/newsgroups/newsgroups-500-3.ids1493
-rw-r--r--datasets/newsgroups/newsgroups_extract.py137
-rw-r--r--datasets/newsgroups/source1
-rw-r--r--datasets/newsgroups/stop.sh12
-rw-r--r--datasets/newsgroups/stop.txt310
-rw-r--r--datasets/newsgroups/words.txt216
-rw-r--r--datasets/segmentation/segmentation_extract.py39
-rw-r--r--datasets/segmentation/source1
-rw-r--r--datasets/wdbc/source1
-rw-r--r--datasets/wdbc/wdbc_extract.py34
17 files changed, 4482 insertions, 0 deletions
diff --git a/datasets/faces/faces_extract.py b/datasets/faces/faces_extract.py
new file mode 100644
index 0000000..3e8b4f3
--- /dev/null
+++ b/datasets/faces/faces_extract.py
@@ -0,0 +1,81 @@
+from scipy.io import loadmat
+from scipy.misc import imsave
+from sklearn.decomposition import PCA
+
+import hashlib
+import logging
+import numpy as np
+import os
+import os.path
+import sklearn.decomposition
+import subprocess
+import wget
+
+
+# Original data: URL of the compressed (.Z) faces archive and the SHA-256 hex
+# digest the downloaded "<DATA_FNAME>.Z" file is checked against
+DATA_URL = "http://isomap.stanford.edu/face_data.mat.Z"
+SHA256_DIGEST = "9c5bc75f204071bbd340aa3ff584757ec784b0630206e526d4cd3809f2650a8a"
+
+# Local name of the uncompressed .mat file read with scipy.io.loadmat
+DATA_FNAME = "face_data.mat"
+
+# Output files/directories:
+#   IMG_DIR      - directory receiving one PNG per face image
+#   IMG_FNAME    - raw image table (one image per row)
+#   LIGHTS_FNAME - transposed "lights" table from the .mat file
+#   POSES_FNAME  - transposed "poses" table from the .mat file
+#   PCA_FNAME    - PCA-whitened image table (256 components)
+IMG_DIR = "images"
+IMG_FNAME = "face_raw.tbl"
+LIGHTS_FNAME = "face_lights.tbl"
+POSES_FNAME = "face_poses.tbl"
+PCA_FNAME = "faces.tbl"
+
+
+if __name__ == "__main__":
+    # All progress and error messages go to a log file next to the script.
+    logging.basicConfig(filename="faces_extract.log",
+                        format="%(levelname)s:%(message)s",
+                        level=logging.INFO)
+
+    compressed_fname = "{}.Z".format(DATA_FNAME)
+
+    # Get original data (download only if neither the uncompressed nor the
+    # compressed file is already present)
+    if not os.path.exists(DATA_FNAME):
+        if not os.path.exists(compressed_fname):
+            logging.info("Downloading faces data from '{}'".format(DATA_URL))
+            wget.download(DATA_URL, compressed_fname)
+
+        # The digest being verified is SHA-256, so say so in the log.
+        logging.info("Checking SHA-256 digest")
+        with open(compressed_fname, "rb") as f:
+            digest = hashlib.sha256(f.read()).hexdigest()
+        if digest != SHA256_DIGEST:
+            logging.error("File seems corrupted; aborting")
+            raise SystemExit(1)
+
+        logging.info("Uncompressing data into '{}'".format(DATA_FNAME))
+        # Fail here, with a clear message, if `uncompress` is missing or
+        # exits with an error, instead of failing later inside loadmat().
+        try:
+            subprocess.check_call(["uncompress", compressed_fname])
+        except (OSError, subprocess.CalledProcessError) as e:
+            logging.error("Could not uncompress '{}': {}".format(
+                compressed_fname, e))
+            raise SystemExit(1)
+
+    # We have the original data; proceed
+    logging.info("Loading faces data")
+    faces = loadmat(DATA_FNAME)
+
+    # Images are stored one per column; tables are written one item per row.
+    face_images = faces["images"]
+    logging.info("Writing image table data to {}".format(IMG_FNAME))
+    np.savetxt(IMG_FNAME, face_images.T, fmt="%f")
+
+    if not os.path.exists(IMG_DIR):
+        logging.info("Creating directory {}".format(IMG_DIR))
+        os.makedirs(IMG_DIR, 0o755)
+    elif not os.path.isdir(IMG_DIR):
+        logging.error("File {} exists; aborting".format(IMG_DIR))
+        raise SystemExit(1)
+
+    logging.info("Writing image files to {}".format(IMG_DIR))
+    for i in range(face_images.shape[1]):
+        # Each column is a flattened 64x64 image; transpose after reshaping
+        # before saving it as "<index>.png".
+        image = face_images[:, i].reshape(64, 64).T
+        path = os.path.join(IMG_DIR, "{}.png".format(i))
+        imsave(path, image)
+
+    logging.info("Writing lights data to {}".format(LIGHTS_FNAME))
+    np.savetxt(LIGHTS_FNAME, faces["lights"].T, fmt="%f")
+
+    logging.info("Writing poses data to {}".format(POSES_FNAME))
+    np.savetxt(POSES_FNAME, faces["poses"].T, fmt="%f")
+
+    logging.info("Writing PCA-whitened data to {}".format(PCA_FNAME))
+    X = PCA(n_components=256, whiten=True).fit_transform(face_images.T)
+    np.savetxt(PCA_FNAME, X, fmt="%f")
diff --git a/datasets/faces/source b/datasets/faces/source
new file mode 100644
index 0000000..e89da9b
--- /dev/null
+++ b/datasets/faces/source
@@ -0,0 +1 @@
+http://isomap.stanford.edu/datasets.html
diff --git a/datasets/mnist/mnist_extract.py b/datasets/mnist/mnist_extract.py
new file mode 100644
index 0000000..403b250
--- /dev/null
+++ b/datasets/mnist/mnist_extract.py
@@ -0,0 +1,148 @@
+from array import array as pyarray
+from scipy.io import loadmat
+from sklearn.decomposition import PCA
+
+import gzip
+import hashlib
+import logging
+import numpy as np
+import os
+import os.path
+import struct
+import sys
+import wget
+
+
+# Download locations of the four gzip-compressed MNIST files
+TRAIN_IMAGES_URL = "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"
+TRAIN_LABELS_URL = "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"
+TEST_IMAGES_URL = "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"
+TEST_LABELS_URL = "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"
+
+# Expected SHA-256 hex digests of the downloaded files (same order as the URLs)
+TRAIN_IMAGES_SHA256 = "440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609"
+TRAIN_LABELS_SHA256 = "3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c"
+TEST_IMAGES_SHA256 = "8d422c7b0a1c1c79245a5bcf07fe86e33eeafee792b84584aec276f5a2dbc4e6"
+TEST_LABELS_SHA256 = "f7ae60f92e00ec6debd23a6088c31dbd2371eca3ffa0defaefb259924204aec6"
+
+# Text files holding the row indices of the sampled training/test items
+TRAIN_SAMPLE_INDICES_FNAME = "mnist_train_sample.tbl"
+TEST_SAMPLE_INDICES_FNAME = "mnist_test_sample.tbl"
+
+# Local image file names, keyed by data set name ('train' or 'test')
+FNAME_IMG = {
+ 'train': 'train-images-idx3-ubyte.gz',
+ 'test': 't10k-images-idx3-ubyte.gz'
+}
+
+# Local label file names, keyed by data set name ('train' or 'test')
+FNAME_LBL = {
+ 'train': 'train-labels-idx1-ubyte.gz',
+ 'test': 't10k-labels-idx1-ubyte.gz'
+}
+
+
+def download_and_check(in_url, out_fname, sha256sum):
+    """
+    Downloads in_url into out_fname and verifies the downloaded file.
+    Returns True if the SHA-256 hex digest of the file contents equals
+    sha256sum, False otherwise.
+    """
+    logging.info("Downloading '{}'".format(in_url))
+    wget.download(in_url, out_fname)
+
+    with open(out_fname, "rb") as downloaded:
+        digest = hashlib.sha256(downloaded.read()).hexdigest()
+    return digest == sha256sum
+
+
+def load_mnist(data="train", digits=np.arange(10)):
+    """
+    Loads one MNIST data set from the local gzip files.
+
+    data selects the file pair through FNAME_IMG/FNAME_LBL ('train' or
+    'test'); digits is the collection of labels to keep.
+
+    Returns (images, labels): images is a uint8 array with one flattened
+    image (rows*cols values) per row, labels is an int8 column vector with
+    the matching label of each row. Items keep their order in the file.
+    """
+    with gzip.open(FNAME_LBL[data], 'rb') as flbl:
+        # Skip the 8-byte header (two big-endian uint32 values); the item
+        # count is taken from the image file header below.
+        struct.unpack(">II", flbl.read(8))
+        lbl = np.frombuffer(flbl.read(), dtype=np.int8)
+
+    with gzip.open(FNAME_IMG[data], 'rb') as fimg:
+        _, size, rows, cols = struct.unpack(">IIII", fimg.read(16))
+        img = np.frombuffer(fimg.read(), dtype=np.uint8)
+
+    # Select all wanted items at once instead of looping over every sample.
+    # Boolean-mask indexing copies, so the returned arrays are writable even
+    # though np.frombuffer yields read-only views.
+    lbl = lbl[:size]
+    keep = np.isin(lbl, digits)
+    images = img[:size * rows * cols].reshape(size, rows * cols)[keep]
+    labels = lbl[keep].reshape(-1, 1)
+
+    return images, labels
+
+
+if __name__ == "__main__":
+    # All progress and error messages go to a log file next to the script.
+    logging.basicConfig(filename="mnist_extract.log",
+                        format="%(levelname)s:%(message)s",
+                        level=logging.INFO)
+
+    # Get and check original data if needed
+    urls = [TRAIN_IMAGES_URL, TRAIN_LABELS_URL,
+            TEST_IMAGES_URL, TEST_LABELS_URL]
+    fnames = [FNAME_IMG['train'], FNAME_LBL['train'],
+              FNAME_IMG['test'], FNAME_LBL['test']]
+    sha256sums = [TRAIN_IMAGES_SHA256, TRAIN_LABELS_SHA256,
+                  TEST_IMAGES_SHA256, TEST_LABELS_SHA256]
+    for url, fname, sha256sum in zip(urls, fnames, sha256sums):
+        if not os.path.exists(fname):
+            ok = download_and_check(url, fname, sha256sum)
+            if not ok:
+                logging.error("'{}' is corrupted; aborting".format(fname))
+                sys.exit(1)
+
+    # We now have the original data
+    logging.info("Loading MNIST training data")
+    mnist_train = dict()
+    mnist_train['train_X'], mnist_train['train_labels'] = load_mnist("train")
+    train_size = mnist_train['train_X'].shape[0]
+
+    logging.info("Loading MNIST test data")
+    mnist_test = dict()
+    mnist_test['test_X'], mnist_test['test_labels'] = load_mnist("test")
+    test_size = mnist_test['test_X'].shape[0]
+
+    # New sample indices are generated only when a sample size is given on
+    # the command line; otherwise (or if that size is too large) the saved
+    # index files are reused.
+    should_load_samples = True
+    if len(sys.argv) == 2:
+        sample_size = int(sys.argv[1])
+        # Half of the samples come from each set; integer division keeps the
+        # size argument of randint an int.
+        half_size = sample_size // 2
+
+        if half_size > min(train_size, test_size):
+            print("sample size is too large")
+        else:
+            logging.info("Generating {} samples".format(sample_size))
+            train_sample_indices = np.random.randint(0, train_size, half_size)
+            test_sample_indices = np.random.randint(0, test_size, half_size)
+
+            logging.info("Saving generated samples")
+            np.savetxt(TRAIN_SAMPLE_INDICES_FNAME, train_sample_indices,
+                       fmt="%u")
+            np.savetxt(TEST_SAMPLE_INDICES_FNAME, test_sample_indices,
+                       fmt="%u")
+            should_load_samples = False
+
+    if should_load_samples:
+        # Without saved indices there is nothing to fall back on; stop with
+        # a clear message instead of an IndexError/IOError further down.
+        missing = [fname for fname in (TRAIN_SAMPLE_INDICES_FNAME,
+                                       TEST_SAMPLE_INDICES_FNAME)
+                   if not os.path.exists(fname)]
+        if missing:
+            logging.error("Sample index file(s) missing: {}; "
+                          "pass a valid sample size to generate them"
+                          .format(", ".join(missing)))
+            sys.exit(1)
+
+        logging.info("Loading samples")
+        # atleast_1d: loadtxt returns a 0-d array for a single-line file.
+        train_sample_indices = np.atleast_1d(
+            np.loadtxt(TRAIN_SAMPLE_INDICES_FNAME, dtype=int))
+        test_sample_indices = np.atleast_1d(
+            np.loadtxt(TEST_SAMPLE_INDICES_FNAME, dtype=int))
+
+    # Name the outputs after the number of samples actually extracted (for
+    # an odd requested size this is one less than requested).
+    sample_size = train_sample_indices.shape[0] \
+        + test_sample_indices.shape[0]
+
+    logging.info("Extracting {} samples".format(sample_size))
+    train_samples = mnist_train['train_X'][train_sample_indices, :]
+    test_samples = mnist_test['test_X'][test_sample_indices, :]
+    mnist_sample = np.concatenate((train_samples, test_samples))
+    mnist_sample = PCA(n_components=512, whiten=True).fit_transform(mnist_sample)
+
+    train_labels = mnist_train['train_labels'][train_sample_indices]
+    test_labels = mnist_test['test_labels'][test_sample_indices]
+    mnist_sample_labels = np.concatenate((train_labels, test_labels))
+
+    logging.info("Saving extracted samples and their labels")
+    sample_fname = "mnist_{}.tbl".format(sample_size)
+    labels_fname = "mnist_{}.labels".format(sample_size)
+    np.savetxt(sample_fname, mnist_sample, fmt="%f")
+    np.savetxt(labels_fname, mnist_sample_labels, fmt="%u")
diff --git a/datasets/mnist/mnist_test_sample.tbl b/datasets/mnist/mnist_test_sample.tbl
new file mode 100644
index 0000000..01dbdb4
--- /dev/null
+++ b/datasets/mnist/mnist_test_sample.tbl
@@ -0,0 +1,1000 @@
+3973
+5619
+2531
+173
+2631
+2965
+3560
+4015
+8898
+4444
+1512
+6548
+2198
+6312
+1296
+7560
+5442
+1117
+7187
+3760
+6127
+8284
+5753
+1355
+4525
+7358
+9941
+2250
+3971
+3997
+8080
+1439
+9378
+9748
+3982
+9763
+2350
+1694
+9278
+2478
+4600
+2931
+1877
+2024
+1665
+4573
+9992
+4573
+2805
+3637
+4888
+393
+6974
+342
+1428
+7768
+8996
+6218
+8090
+3170
+9294
+3762
+3572
+6668
+4507
+8653
+7773
+7401
+1864
+8081
+8926
+1423
+2438
+2299
+7662
+7016
+8325
+4795
+2827
+1549
+6415
+1814
+1587
+8541
+3768
+6314
+520
+3211
+3042
+7988
+2437
+8701
+7152
+6116
+6788
+1716
+2574
+222
+2507
+7909
+9310
+2515
+9552
+5978
+2747
+8850
+5236
+7072
+9609
+1773
+1996
+7011
+6411
+6782
+7557
+8347
+8594
+1019
+3593
+7665
+1496
+9937
+8440
+2913
+3808
+6796
+4860
+8834
+8010
+1539
+9196
+8876
+9689
+5137
+1135
+2322
+8701
+7240
+5049
+7570
+6400
+3681
+1554
+9645
+8644
+899
+6195
+4824
+8461
+3274
+7104
+9782
+1217
+4177
+3767
+3524
+6676
+5271
+4171
+1631
+1146
+218
+9407
+3410
+6100
+3513
+4910
+2599
+5342
+9572
+2431
+4692
+7466
+4627
+4954
+614
+1599
+4993
+9468
+1112
+5396
+571
+6039
+7624
+2345
+3136
+7770
+968
+7956
+2759
+1176
+4412
+4011
+8072
+6664
+3271
+2622
+1225
+3539
+3123
+4995
+4203
+8476
+5421
+4099
+7507
+4915
+5903
+2902
+3787
+9845
+626
+5494
+7632
+3265
+2735
+4421
+7779
+1584
+9824
+6218
+6274
+1646
+7492
+9262
+3954
+7870
+5170
+4843
+1354
+6864
+4077
+9041
+7851
+554
+1383
+5010
+83
+2836
+8388
+3615
+6271
+3648
+5977
+6805
+1980
+5002
+3512
+2502
+7348
+5785
+3173
+7501
+5926
+4718
+7266
+6330
+837
+5880
+4539
+1666
+2744
+6435
+8204
+2600
+9381
+7991
+2922
+8758
+3510
+2159
+5589
+9240
+9164
+3487
+82
+8549
+8142
+5323
+2833
+5876
+8804
+5384
+6427
+9445
+8258
+4534
+1401
+4012
+1963
+8864
+1296
+1971
+6865
+5059
+9953
+9929
+3362
+3947
+2774
+7999
+7872
+8911
+8231
+7525
+7673
+9513
+4506
+6972
+4863
+9817
+7406
+2814
+5024
+1235
+6361
+9599
+7879
+3188
+3635
+3510
+3144
+619
+7330
+9709
+9059
+6732
+1336
+6753
+5323
+3225
+7434
+1248
+3561
+9789
+584
+3547
+5062
+684
+683
+9749
+1890
+3295
+3922
+2478
+9084
+2081
+5425
+4698
+2516
+9604
+966
+5215
+3516
+4565
+4994
+9493
+4670
+870
+4205
+2392
+3959
+5435
+8656
+4656
+4864
+5566
+36
+4241
+952
+8403
+8439
+927
+2054
+6004
+9119
+7485
+7909
+4296
+4955
+2010
+1665
+1517
+284
+6760
+6873
+7480
+4833
+9706
+6572
+4516
+7929
+3593
+599
+9033
+7960
+1294
+8328
+2060
+3968
+9377
+6773
+4262
+8589
+6944
+5036
+1168
+3302
+9869
+3937
+6549
+9227
+490
+1144
+8171
+168
+3615
+177
+6932
+1850
+2162
+1383
+9091
+6706
+2185
+378
+8364
+5087
+4923
+8251
+4509
+6684
+5531
+7363
+9608
+2523
+6755
+9274
+3125
+7483
+1757
+5886
+5546
+9232
+7339
+1532
+6687
+2077
+151
+9142
+8634
+4382
+5496
+6438
+190
+8095
+1044
+1185
+5477
+2091
+9776
+9808
+7624
+8804
+5577
+4743
+9430
+9559
+8800
+5013
+6185
+5485
+3654
+6515
+2616
+6941
+8565
+774
+123
+1785
+1295
+5907
+6938
+3834
+8451
+3349
+9035
+8060
+5346
+130
+9765
+2271
+1931
+6623
+5083
+1360
+9246
+418
+9861
+7503
+2337
+9378
+8950
+1058
+9388
+6561
+9052
+2369
+9216
+4447
+3863
+4109
+9174
+4578
+7114
+7682
+1292
+9262
+4037
+9389
+977
+5536
+6542
+4658
+6248
+5812
+5368
+1902
+2241
+3998
+8438
+2537
+7300
+324
+6035
+705
+6183
+6218
+7344
+9178
+5217
+9838
+6091
+7340
+2628
+4193
+3754
+953
+7502
+3565
+4096
+4102
+5243
+2539
+6210
+4066
+9885
+4510
+1014
+6225
+4623
+4070
+9637
+7889
+2956
+7615
+6248
+4692
+9723
+7971
+6075
+9411
+7988
+9494
+2855
+1128
+4452
+2287
+6406
+9620
+3490
+6300
+4194
+976
+4855
+5826
+2230
+8451
+4355
+1056
+5916
+6232
+6178
+7111
+8189
+8229
+4582
+4987
+2504
+5209
+9065
+542
+9704
+5142
+8361
+650
+7443
+9784
+827
+3290
+5941
+7560
+1708
+7139
+9780
+4061
+2787
+1098
+2374
+4307
+474
+2027
+5593
+5711
+7421
+877
+4290
+449
+6847
+5852
+5325
+4446
+265
+2754
+1522
+251
+8177
+8549
+2520
+4924
+6980
+5069
+8282
+3407
+4092
+31
+8512
+2922
+718
+11
+8929
+8633
+543
+3798
+4298
+1608
+5033
+7498
+7880
+8875
+9322
+4915
+4635
+211
+8764
+4584
+3430
+4709
+3673
+4644
+5422
+1496
+1302
+7828
+9634
+9829
+8080
+5497
+7544
+4022
+7035
+9211
+8835
+3730
+3572
+7233
+9559
+8775
+275
+7197
+3020
+4200
+2802
+2257
+3467
+1800
+3972
+416
+5186
+6074
+7911
+6703
+6512
+9216
+771
+4106
+994
+4621
+5264
+4301
+4850
+7139
+2845
+3084
+9974
+5940
+5407
+1302
+6306
+4011
+4588
+4830
+8517
+5363
+1698
+5259
+4667
+4592
+1327
+4744
+5258
+8023
+1070
+8483
+9563
+2784
+7275
+9539
+633
+1612
+2249
+584
+1400
+1746
+4130
+6278
+4132
+1817
+5721
+6304
+7754
+6763
+4966
+4103
+350
+1445
+5864
+5294
+498
+6512
+6985
+1344
+6528
+9555
+6924
+3436
+2995
+4486
+7545
+2166
+1347
+5680
+8737
+2049
+5923
+6491
+8691
+7240
+4321
+9902
+1352
+4586
+6870
+9504
+4282
+814
+5203
+9037
+6233
+4286
+1500
+2867
+411
+2915
+804
+8028
+775
+6782
+9628
+6657
+5371
+8703
+9310
+9727
+3054
+3224
+4925
+6034
+8457
+255
+7549
+8916
+9338
+4516
+6693
+2317
+3229
+5808
+7849
+5907
+6960
+9521
+4025
+6247
+1126
+8732
+2250
+6327
+339
+8920
+532
+6762
+6966
+202
+2118
+4469
+1402
+2608
+3524
+8060
+6309
+2867
+2961
+6588
+2806
+4728
+7695
+3697
+7446
+1861
+3047
+6188
+1553
+7946
+5868
+4599
+1290
+3645
+4893
+7128
+544
+5489
+2054
+5743
+1459
+4830
+6090
+3792
+7530
+1310
+4238
+8773
+2622
+2441
+4483
+4987
+4
+1785
+3994
+8562
+9479
+8214
+3025
+5599
+5541
+3878
+1022
+516
+2749
+947
+9004
+945
+999
+1732
+6760
+3753
+408
+8403
+8616
+8808
+2863
+6291
+2927
+5566
+2551
+6242
+973
+3016
+9882
+4616
+5908
+9717
+494
+6446
+3305
+8158
+957
+8490
+8972
+1272
+700
+6195
+6060
+6416
+5052
+909
+3195
+1410
+8089
+4350
+1384
+568
+2457
+9287
+4669
+5095
+3792
+587
+3252
+2553
+7829
+4747
+387
+5560
+2238
+7800
+7076
+9380
+1121
+7393
+876
+8840
+99
+5336
+4206
+1363
+3166
+4376
+2
+5769
+6015
+3537
+8225
+0
+2105
+3266
+5232
+6300
+1668
+7178
+2302
+9710
+4617
+6516
+4722
+9427
+9090
+3217
+4941
+1897
+7119
+467
+5375
+3561
+418
+4564
+1775
+5244
+1708
+3234
+2402
+7947
+9803
+7822
+151
+8200
+510
+2616
+8291
+5053
diff --git a/datasets/mnist/mnist_train_sample.tbl b/datasets/mnist/mnist_train_sample.tbl
new file mode 100644
index 0000000..0b136db
--- /dev/null
+++ b/datasets/mnist/mnist_train_sample.tbl
@@ -0,0 +1,1000 @@
+57616
+17592
+35740
+20022
+18918
+6515
+47576
+21464
+314
+49504
+44255
+274
+12270
+18426
+47098
+15671
+21746
+15811
+55537
+55389
+13571
+34863
+28102
+37300
+23374
+9773
+45834
+43443
+13699
+32778
+48444
+27905
+39405
+31994
+35449
+2653
+39538
+31644
+55621
+49589
+33400
+48087
+37060
+5615
+24531
+29602
+13300
+40262
+19419
+22407
+59794
+59057
+22781
+58471
+44220
+7522
+51504
+20731
+40111
+23730
+18806
+2268
+48366
+59118
+10441
+52308
+3980
+20045
+43691
+7395
+46980
+31485
+11893
+29599
+59330
+15598
+18435
+30758
+2064
+57422
+7182
+19072
+23953
+38725
+41781
+38145
+16674
+31199
+34283
+36525
+14807
+22142
+38126
+2029
+26729
+12707
+20955
+8202
+41186
+55210
+46653
+53225
+13980
+36439
+38367
+19888
+40442
+22213
+10670
+15121
+33148
+38950
+20949
+47570
+7746
+34761
+34571
+57804
+2342
+44979
+45321
+48538
+31397
+58603
+40334
+11865
+4047
+38967
+41588
+12377
+17679
+59706
+43049
+1111
+47047
+55831
+5013
+20843
+25249
+43546
+39075
+7040
+40238
+55874
+40866
+17630
+40943
+45366
+32838
+7111
+31456
+20075
+57155
+51424
+45547
+55680
+20006
+49687
+51880
+2478
+59925
+8823
+35400
+6391
+6317
+32393
+18334
+33229
+52966
+610
+54534
+4053
+21797
+22954
+57184
+221
+782
+32494
+41319
+43098
+19062
+6984
+6527
+7383
+18878
+50131
+7372
+7476
+13935
+38478
+17870
+23977
+49675
+29469
+40345
+46503
+52574
+54370
+3183
+43099
+32347
+28495
+59102
+30434
+55341
+19146
+39833
+15710
+4594
+8962
+52262
+25213
+13208
+31540
+14932
+10167
+30292
+5674
+53250
+41076
+6776
+27278
+40701
+13723
+25387
+20423
+21282
+56499
+41356
+53193
+17825
+8226
+5938
+10545
+35398
+42341
+5002
+48616
+118
+1854
+19457
+47245
+28034
+26256
+19308
+11119
+40804
+15161
+27192
+57256
+56795
+13757
+21523
+1990
+29060
+16911
+45716
+30104
+47456
+56140
+7129
+44887
+8425
+31547
+45015
+19132
+26316
+21805
+42922
+59318
+56646
+2559
+56001
+27174
+45604
+43889
+7978
+2153
+16063
+3893
+15841
+19827
+50977
+37370
+46923
+59844
+38208
+46018
+41661
+6787
+25298
+31325
+44838
+50220
+29722
+49477
+40492
+27564
+52048
+50522
+39573
+20750
+8455
+22065
+55737
+10024
+44971
+58043
+27397
+47389
+19956
+49412
+9202
+25218
+29632
+19089
+15577
+34808
+50562
+46310
+23015
+20072
+48681
+34695
+56340
+15994
+16476
+22842
+44730
+16490
+45098
+1075
+36609
+17507
+17455
+49243
+14003
+26233
+17753
+801
+37771
+14320
+9597
+8226
+34532
+29634
+32143
+57209
+14431
+3626
+54846
+38836
+27628
+58887
+46146
+7617
+40997
+51786
+48037
+7515
+46117
+737
+6944
+9637
+24650
+19940
+11521
+54289
+10438
+40171
+35118
+52702
+25979
+40534
+45914
+6720
+31211
+37663
+4577
+39590
+51347
+58143
+33514
+10615
+25444
+5506
+15033
+46269
+26080
+48588
+16404
+13579
+40096
+13467
+49872
+59896
+55475
+32428
+12781
+56377
+53077
+56730
+5342
+13619
+39237
+10853
+13016
+20639
+14830
+7664
+41604
+25751
+300
+35102
+19849
+15766
+12979
+31771
+32602
+31761
+43788
+57370
+35274
+18208
+13870
+15842
+2671
+5217
+25586
+57012
+20552
+12579
+26353
+45120
+43146
+15230
+53589
+50219
+50311
+6967
+6167
+7353
+3965
+10934
+9052
+25427
+33921
+42194
+12527
+17364
+10177
+33662
+20745
+32536
+41904
+19963
+22195
+22549
+6632
+9889
+56236
+53022
+35725
+42694
+16032
+38542
+24758
+1802
+40481
+28033
+16730
+27517
+42091
+4849
+18084
+59100
+3990
+27005
+17890
+18074
+19225
+27349
+42790
+5577
+3000
+45881
+18213
+55348
+47812
+33784
+27582
+30
+39035
+24942
+5179
+22934
+7829
+22250
+50897
+52136
+4512
+54522
+33580
+57999
+6
+17499
+44035
+33174
+46357
+6980
+48263
+25670
+51411
+47402
+12477
+28570
+29781
+1105
+46944
+52178
+27981
+1945
+3431
+11008
+1925
+55090
+17868
+24114
+30978
+6540
+11376
+14505
+44289
+47336
+32765
+58824
+29040
+25884
+46350
+21710
+56361
+28692
+27269
+36220
+49106
+13907
+37738
+12228
+38044
+12753
+20518
+46432
+29569
+14361
+42860
+46855
+4726
+29445
+52094
+21769
+42226
+24806
+45423
+55215
+39094
+46734
+17129
+3640
+36628
+56080
+48031
+49386
+3173
+9713
+1636
+45135
+23810
+8566
+3271
+34111
+3968
+34837
+14182
+56039
+29414
+2625
+26194
+1865
+9577
+9867
+9636
+21600
+28828
+53985
+31700
+43341
+15693
+51745
+11953
+5350
+24821
+13598
+57255
+25230
+37624
+12629
+41842
+8366
+57945
+39079
+44896
+59269
+30424
+38785
+19199
+33009
+8067
+42738
+52697
+34452
+23321
+14719
+48976
+29272
+23271
+56086
+20336
+52107
+11830
+50826
+37389
+46229
+21872
+9314
+23795
+9193
+52074
+21327
+47624
+11887
+49012
+54328
+20828
+3109
+49819
+25723
+34310
+59136
+19174
+16688
+21371
+29697
+38450
+42881
+25583
+20355
+3672
+22386
+53670
+40080
+56620
+58651
+5132
+45963
+26799
+37046
+55683
+34252
+57630
+6523
+25146
+12865
+51186
+39016
+27828
+56666
+18486
+58665
+56209
+53839
+41163
+3409
+4117
+37738
+27035
+21013
+10042
+6348
+41410
+5989
+17595
+28447
+53010
+3887
+20584
+210
+51431
+58339
+59181
+28704
+54516
+32421
+17650
+34597
+40747
+8175
+47896
+46883
+13524
+37808
+53754
+434
+37154
+4800
+26379
+6292
+59644
+3665
+24044
+54544
+7689
+18424
+47761
+31404
+13379
+34133
+3885
+32757
+5936
+52907
+28721
+29113
+20960
+21089
+23480
+55886
+15426
+59779
+23741
+41373
+58737
+18447
+11813
+37271
+34253
+43787
+20375
+9676
+21516
+6112
+44909
+45557
+18346
+11669
+7183
+48922
+27542
+40695
+16962
+49353
+32351
+16987
+12825
+1687
+45980
+47744
+15279
+53463
+34186
+18334
+51386
+46662
+17011
+33724
+31658
+5435
+36696
+43562
+30754
+57056
+44728
+22622
+23268
+19760
+44223
+19033
+58856
+24768
+45390
+6111
+27347
+19049
+58364
+53201
+26508
+28685
+49889
+32145
+24442
+50683
+29089
+58209
+11491
+31186
+44327
+28716
+2250
+35204
+9207
+34721
+8905
+30420
+50198
+6715
+25686
+37755
+33949
+48826
+45839
+53569
+37224
+48830
+10076
+51664
+36094
+24915
+19176
+52754
+43065
+43369
+56068
+17771
+21623
+37085
+1651
+33232
+51024
+38208
+31317
+38383
+13661
+35438
+46183
+37855
+37947
+14860
+3413
+55236
+19724
+7523
+1800
+18376
+52348
+19879
+53325
+50800
+53410
+57100
+15450
+51173
+19816
+15163
+36804
+33383
+36498
+29529
+14041
+23527
+14687
+17229
+26800
+26413
+22147
+9450
+52830
+24639
+49517
+48152
+50283
+52594
+15371
+40367
+43886
+45651
+44733
+17024
+44066
+55340
+11711
+59444
+57575
+50352
+37853
+15395
+50645
+4514
+24351
+55141
+32242
+18352
+30645
+53462
+32026
+15911
+47859
+2325
+2040
+20152
+16447
+7861
+32277
+45087
+29190
+50412
+26261
+13543
+32734
+5702
+24248
+12978
+10042
+57447
+499
+32466
+18431
+58556
+43702
+63
+9529
+15490
+23804
+1759
+52811
+16673
+18573
+5664
+28446
+26391
+45384
+48015
+57691
+4717
+46134
+27084
+39128
+28829
+24662
+22151
+3268
+55806
+24820
+51857
+19078
+1096
+10933
+12914
+8356
+44247
+33466
+27531
+1580
+51242
+5040
+15635
+4654
+55430
+37081
+26232
+18655
+32103
+42792
+57232
+57450
+57955
+25265
+20074
+56113
+22123
+26840
+10551
+58099
+13944
+37154
+437
+19498
+55523
+31750
+19266
+27300
+2007
+22425
+4642
+1862
+45703
+18243
+45478
+16779
+57440
+19256
+9985
diff --git a/datasets/mnist/source b/datasets/mnist/source
new file mode 100644
index 0000000..19ebae4
--- /dev/null
+++ b/datasets/mnist/source
@@ -0,0 +1 @@
+http://yann.lecun.com/exdb/mnist/
diff --git a/datasets/newsgroups/README b/datasets/newsgroups/README
new file mode 100644
index 0000000..78e92ee
--- /dev/null
+++ b/datasets/newsgroups/README
@@ -0,0 +1,7 @@
+To run the script newsgroups_extract.py we used N = 500 and the topics
+'comp.graphics', 'misc.forsale', and 'sci.med'.
+
+The randomly generated ids in our case are in the file 'newsgroups-500-3.ids'.
+
+Before running the script, be sure to generate the stopwords file using
+'stop.sh'.
diff --git a/datasets/newsgroups/newsgroups-500-3.ids b/datasets/newsgroups/newsgroups-500-3.ids
new file mode 100644
index 0000000..9b468bd
--- /dev/null
+++ b/datasets/newsgroups/newsgroups-500-3.ids
@@ -0,0 +1,1493 @@
+38998
+38380
+38224
+38260
+38642
+38718
+38728
+39495
+38338
+38696
+38984
+39068
+39665
+38657
+38944
+38811
+38421
+38324
+38580
+38430
+38541
+37950
+38483
+37940
+38794
+37913
+39053
+38412
+38916
+39677
+39644
+38502
+38960
+38594
+39022
+38777
+38855
+38865
+38909
+38511
+38831
+38813
+38522
+38558
+38504
+38273
+37947
+38802
+38354
+38639
+39016
+38808
+38608
+38772
+38673
+38928
+38452
+38887
+38795
+38864
+38484
+38305
+38975
+38833
+37918
+38856
+38344
+37936
+39670
+38776
+38547
+39063
+38731
+38353
+39064
+38760
+38220
+38231
+38459
+38889
+39072
+38878
+37958
+38278
+39642
+39496
+38962
+38948
+37932
+38700
+38963
+38393
+38405
+38753
+39035
+39671
+38445
+39012
+38896
+38787
+38839
+38259
+38560
+38803
+38751
+38613
+38595
+38822
+38610
+38996
+38945
+38845
+38933
+38530
+38474
+38981
+38265
+37935
+37961
+38650
+38599
+38486
+38229
+38364
+39002
+38321
+38299
+38645
+38505
+39031
+39061
+38882
+38921
+38747
+39017
+38913
+38559
+39041
+39030
+38535
+38911
+38809
+38296
+39624
+38397
+38233
+38345
+38458
+38871
+38854
+38255
+39034
+38375
+38872
+38791
+38232
+38646
+38544
+38314
+38980
+38286
+38716
+38720
+38997
+38792
+38886
+38714
+38605
+38685
+38769
+38899
+38582
+38736
+38799
+38683
+39623
+38306
+38551
+38652
+38532
+39067
+38986
+38804
+38668
+38626
+38274
+38373
+38999
+38644
+38498
+38978
+38834
+39622
+38357
+38262
+38287
+37943
+38991
+38322
+38313
+38492
+37948
+39620
+38550
+38327
+38816
+38609
+39011
+38542
+38835
+39647
+38763
+38715
+39656
+38987
+38239
+38587
+38507
+38627
+39636
+38623
+38957
+38451
+39650
+39049
+39669
+39039
+38968
+38399
+37945
+38312
+39059
+39019
+38693
+38369
+38801
+38477
+39044
+38525
+38641
+39640
+38733
+38702
+39037
+38234
+38294
+38332
+37952
+38466
+37931
+38443
+40027
+39004
+38619
+38298
+38325
+38348
+38807
+38455
+38876
+38869
+38767
+38800
+38863
+38555
+38439
+38625
+38337
+38284
+38840
+38764
+38914
+38361
+38884
+38765
+38904
+38493
+38310
+38953
+38584
+37938
+38972
+38471
+38655
+38995
+38225
+38925
+38930
+38713
+38958
+38531
+38628
+39490
+38330
+39673
+39010
+38476
+38758
+38832
+38651
+38276
+39637
+38433
+38588
+38669
+38223
+38311
+38387
+38721
+39079
+38881
+38852
+39737
+39488
+38723
+38251
+38597
+38389
+37914
+37930
+38554
+38830
+38724
+38741
+38867
+38947
+38745
+38977
+37925
+39626
+39066
+38413
+37957
+38617
+38690
+37939
+38496
+38464
+38221
+38748
+38775
+38618
+38942
+38632
+38257
+38612
+38927
+38331
+38382
+38631
+38365
+38660
+38729
+38398
+38826
+38691
+38407
+38812
+38253
+39006
+38342
+39634
+37953
+38759
+38810
+39062
+39076
+37921
+38261
+38434
+39054
+39615
+38381
+39047
+38782
+38280
+38992
+39661
+39084
+39014
+38424
+38779
+38749
+38983
+38637
+38709
+38686
+39027
+38323
+37962
+38923
+38228
+38931
+37916
+38099
+38727
+37942
+38844
+38966
+38843
+38746
+38347
+38317
+39040
+38708
+38790
+38665
+39075
+38938
+38725
+37924
+38937
+38437
+38674
+38692
+38527
+38400
+39638
+38562
+38744
+38526
+39042
+38340
+38902
+38825
+39069
+38570
+38362
+38564
+38275
+38903
+38403
+38581
+38536
+38567
+38377
+38781
+38906
+38328
+38216
+38624
+38293
+38240
+38545
+38635
+38891
+38877
+38463
+38658
+38994
+39020
+38890
+38850
+38654
+39668
+38814
+38515
+39023
+38292
+39643
+38666
+38352
+37963
+38719
+38319
+38411
+39676
+37922
+38303
+38436
+38244
+38820
+38621
+38680
+38773
+38687
+38450
+38252
+37261
+38269
+38444
+38989
+39655
+38579
+38552
+38670
+38320
+38533
+38620
+39001
+38846
+38734
+38241
+38238
+39080
+38429
+38499
+38982
+38277
+39057
+59193
+58141
+59454
+59029
+59020
+59187
+59453
+59121
+59531
+59581
+59514
+59533
+59622
+58050
+59015
+58114
+58799
+59297
+58092
+58065
+59388
+59603
+59069
+59640
+59580
+58997
+59021
+59381
+59604
+59119
+59353
+59318
+58918
+59480
+59234
+58866
+58113
+59341
+59136
+59635
+59301
+58053
+58981
+58778
+59415
+58940
+59007
+58048
+59006
+59028
+59396
+58796
+59455
+59001
+59062
+59180
+59436
+59463
+59299
+59075
+59320
+59236
+58118
+58836
+59628
+58931
+58943
+59158
+58963
+59345
+58978
+59265
+58851
+59024
+59063
+58111
+59254
+58830
+58055
+59197
+58824
+59014
+59190
+58785
+58071
+58897
+59131
+59639
+58955
+59044
+59092
+59224
+58093
+59225
+59334
+59200
+59369
+58774
+58797
+59220
+59059
+58129
+59351
+59192
+58133
+59491
+58898
+59145
+59448
+58869
+59198
+58578
+59270
+58760
+59241
+59085
+58794
+58854
+59404
+58790
+58127
+59417
+59250
+58888
+58881
+59571
+59434
+59539
+59025
+59642
+58817
+59129
+58989
+59115
+58841
+59372
+59058
+58098
+59641
+59594
+59401
+59142
+58967
+58877
+59513
+58954
+58070
+59469
+59385
+59357
+59368
+59595
+58942
+59041
+59364
+59356
+59052
+59358
+58947
+59082
+59346
+59430
+58867
+59451
+58973
+58977
+59228
+58770
+59153
+59522
+59347
+59185
+59556
+59414
+58970
+59100
+59287
+58878
+58904
+59602
+59002
+59428
+59498
+58112
+58153
+58052
+58064
+59573
+59507
+59138
+59196
+58789
+58910
+59481
+58951
+58577
+59515
+59040
+58798
+59591
+58884
+59077
+59398
+59336
+59268
+59378
+59140
+58920
+58091
+59487
+59575
+59027
+59510
+59425
+59540
+59276
+59079
+59106
+58938
+59047
+59127
+59303
+59344
+59638
+58807
+58969
+59389
+58825
+59034
+59512
+59326
+59122
+59629
+59419
+59033
+58155
+58919
+59205
+58862
+59493
+58808
+58121
+58966
+59496
+59380
+59506
+59148
+58968
+59291
+58109
+59476
+58894
+58145
+58883
+59625
+58054
+59468
+58140
+58900
+59212
+58932
+59111
+59466
+59026
+59076
+59202
+59049
+58139
+59211
+58956
+59097
+59505
+59472
+59046
+59517
+59072
+59199
+59030
+59373
+59342
+59156
+59444
+58934
+59500
+58096
+59330
+58082
+59400
+59309
+59605
+59548
+59474
+58833
+59566
+59070
+58056
+59157
+59537
+58569
+58103
+59518
+59288
+59273
+59132
+58944
+58783
+58777
+58926
+58084
+58820
+59606
+58046
+59402
+58911
+59008
+59362
+59160
+58154
+58840
+59172
+59178
+58764
+59210
+59061
+59176
+58804
+59088
+58110
+59023
+58834
+59449
+58871
+58946
+59286
+59031
+59327
+59627
+58902
+59619
+58142
+58852
+58806
+59048
+58868
+58810
+59585
+59379
+59329
+58964
+58803
+59467
+59460
+59089
+59139
+59446
+58896
+59587
+59154
+59532
+59482
+59599
+59504
+58099
+59264
+58809
+59054
+58045
+58787
+59223
+59624
+59179
+59393
+59471
+59313
+59081
+58985
+59435
+58126
+59376
+59253
+59144
+58800
+59339
+58846
+59169
+59509
+59564
+59528
+59354
+59577
+59134
+59159
+58849
+58839
+59123
+58152
+59203
+59516
+58568
+58843
+59439
+59488
+59281
+59382
+59343
+59167
+58850
+59112
+58857
+59011
+59102
+58150
+59569
+58078
+58057
+59257
+58801
+59275
+58079
+59305
+59283
+58979
+58971
+59394
+59258
+59238
+58873
+59597
+59151
+59416
+58991
+58061
+59440
+59022
+59289
+59464
+58922
+59411
+58786
+59352
+58876
+58097
+59530
+58811
+58819
+58782
+58791
+58909
+59308
+58100
+59280
+58072
+59328
+58829
+58086
+58880
+59242
+59319
+58831
+59064
+59039
+58805
+59590
+59547
+59091
+58123
+59623
+59584
+59080
+59243
+59399
+58905
+58832
+59648
+58950
+58813
+59244
+59226
+59208
+59101
+59003
+58802
+58769
+58891
+59422
+59104
+59045
+58089
+59125
+58766
+59544
+59095
+59568
+59479
+59427
+58107
+59099
+76462
+76866
+76902
+76127
+75855
+76143
+76013
+74765
+76477
+76491
+76528
+76247
+75893
+76494
+76594
+76111
+75881
+75987
+76497
+76248
+74763
+75892
+76343
+76115
+75843
+76001
+75884
+74732
+76054
+76263
+76396
+76517
+76074
+76086
+76165
+76808
+76596
+75920
+76305
+75873
+76507
+76936
+76292
+76077
+76571
+75903
+74778
+76200
+76011
+75978
+76043
+76505
+76819
+74758
+76085
+76137
+75986
+76856
+76091
+76666
+76339
+76344
+76833
+76347
+75993
+76785
+76169
+76644
+75898
+76863
+76504
+76789
+75944
+75848
+75849
+75941
+76816
+76476
+76008
+76322
+76470
+76755
+74757
+76672
+75945
+74818
+75951
+76160
+76194
+76044
+75851
+75868
+75902
+74734
+74819
+75852
+76677
+76927
+76087
+76445
+76342
+75933
+76363
+75984
+76084
+76014
+76082
+76420
+76678
+74735
+74773
+76238
+76804
+75889
+75869
+76117
+76052
+76639
+76098
+76102
+76475
+76312
+76070
+76661
+76119
+74739
+76291
+74794
+76093
+76217
+76259
+76245
+76360
+75926
+75942
+74795
+76467
+76302
+76126
+76640
+74725
+76276
+76662
+76473
+75896
+74806
+75930
+75916
+76586
+74820
+76432
+75911
+76063
+74744
+76224
+76056
+75983
+76213
+76496
+76189
+76834
+74729
+76679
+76813
+76488
+76274
+76072
+76794
+75862
+76880
+76850
+76021
+76944
+74746
+74830
+75888
+76367
+75953
+75969
+76003
+74807
+76825
+76848
+76406
+76826
+76050
+76176
+76334
+76308
+76311
+76250
+76882
+76231
+75982
+76482
+75980
+75918
+76329
+75905
+76664
+76298
+76490
+76188
+75872
+76215
+76353
+76458
+76521
+76538
+76840
+76453
+76134
+74769
+76421
+76057
+76288
+76065
+76831
+76704
+76624
+76434
+76567
+76253
+76107
+74730
+76824
+76149
+76184
+74814
+76258
+76252
+76139
+76278
+76812
+76506
+76537
+76170
+76835
+76096
+75971
+74828
+76757
+76845
+75904
+75051
+76140
+76486
+76459
+74792
+76362
+76364
+77056
+74781
+76359
+74776
+76046
+76281
+76844
+76516
+74743
+76261
+76381
+74740
+76108
+76132
+76378
+76405
+76161
+76608
+76133
+76275
+76783
+74741
+76198
+76435
+75854
+76606
+76301
+76485
+74768
+76038
+75997
+76837
+76851
+76860
+75989
+76414
+76350
+76852
+76214
+76030
+74774
+76481
+75947
+76446
+76124
+76512
+76151
+76318
+76089
+76556
+76478
+76118
+76180
+74824
+74150
+76075
+76101
+76781
+76418
+76500
+76349
+76182
+76881
+76024
+76039
+76450
+76045
+76109
+76489
+76079
+76809
+76939
+74772
+76510
+76658
+76069
+76601
+75912
+76438
+76691
+76078
+76270
+76447
+76461
+76861
+75891
+76437
+76314
+74798
+76254
+76192
+76836
+76240
+76878
+74727
+76779
+74761
+76148
+76522
+75846
+76417
+75876
+76141
+76233
+76425
+76547
+76940
+76800
+76357
+76212
+76328
+76827
+76650
+76558
+76649
+76372
+76251
+76655
+74810
+76416
+75929
+75948
+76681
+75908
+76842
+75927
+75940
+75842
+75864
+76935
+75994
+76832
+76209
+76310
+76597
+75906
+76659
+76671
+76035
+76391
+74808
+76868
+76591
+74797
+76455
+76546
+76853
+76120
+76053
+76273
+76144
+76092
+75958
+76460
+76879
+76463
+76479
+75860
+75992
+76792
+75955
+76574
+76515
+75962
+76123
+76648
+76317
+74720
+74826
+76685
+76348
+75946
+76220
+76563
+76204
+76578
+76855
+76048
+76088
+76060
+76025
+76553
+75883
+76227
+75961
+76508
+76145
+76793
+76181
+74785
+74721
+76193
+76041
+76171
+76638
+76210
+76645
+76256
+75899
+76099
+76823
+75981
+74752
+76673
+76095
+76324
+75879
+75895
+76131
+76780
+76229
+76820
+74726
+76023
+76299
+76796
+76152
+76599
+76354
+76575
+76456
+76419
+76572
+76545
+76603
+76287
+76600
+76315
+76647
+76684
+76172
+76177
+74804
+76665
+76340
+77014
+76094
+76582
+76338
+74829
+74793
+76174
+76199
+74736
+76570
diff --git a/datasets/newsgroups/newsgroups_extract.py b/datasets/newsgroups/newsgroups_extract.py
new file mode 100644
index 0000000..51c5030
--- /dev/null
+++ b/datasets/newsgroups/newsgroups_extract.py
@@ -0,0 +1,137 @@
+from sklearn.decomposition import PCA
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+import hashlib
+import logging
+import numpy as np
+import os
+import os.path
+import sys
+import tarfile
+import wget
+
+
+# Location of the 20 Newsgroups tarball, the local file name it is saved
+# under, and the SHA-256 hex digest the local copy is compared against.
+DATA_URL = "http://kdd.ics.uci.edu/databases/20newsgroups/20_newsgroups.tar.gz"
+DATA_FILE = "20_newsgroups.tar.gz"
+# NOTE(review): this literal is 63 hex characters long, but a SHA-256 hex
+# digest has 64; it looks truncated and so cannot equal any real digest --
+# TODO confirm against a known-good copy of the archive.
+DATA_SHA256 = "b7bbf82b7831f7dbb1a09d9312f66fa78565c8de25526999b0d66f69d37e414"
+
+
+def build_topic_corpus(corpus_file, n, topic):
+    """
+    Samples up to N random articles of one topic from an open corpus tarball.
+
+    corpus_file: an open tarfile.TarFile to read the articles from.
+    n: maximum number of articles to sample.
+    topic: substring that an archive member name must contain for the member
+    to be considered part of the topic.
+
+    Returns a (topic_ids, topic_corpus) pair of parallel lists: the base file
+    name of each sampled article and its contents decoded as UTF-8. Articles
+    that are not valid UTF-8 are logged and skipped, so fewer than N items may
+    be returned.
+
+    Raises ValueError (carrying the topic) if no regular file matches.
+    """
+    logging.info("Extracting corpus for topic '{}'".format(topic))
+    # Walk the member list once instead of looking every name up again with
+    # getmember(), which repeats a search per name.
+    topic_items = [ti for ti in corpus_file.getmembers()
+                   if topic in ti.name and ti.isfile()]
+    if not topic_items:
+        # Topic does not exist (no items fetched)
+        raise ValueError(topic)
+
+    topic_ids = []
+    topic_corpus = []
+    # Shuffle the positions and keep the first N to draw a random sample.
+    indices = np.arange(len(topic_items))
+    np.random.shuffle(indices)
+    for i in indices[:n]:
+        ti = topic_items[i]
+        with corpus_file.extractfile(ti) as f:
+            raw = f.read()
+        try:
+            contents = raw.decode("utf8")
+        except UnicodeDecodeError as e:
+            # logging.warn() is a deprecated alias that recent Python releases
+            # no longer provide; logging.warning() is the supported spelling.
+            logging.warning("Encoding error in '{}': {}".format(ti.name, e))
+            continue
+        # IDs and contents are appended together so both lists stay aligned.
+        topic_ids.append(os.path.basename(ti.name))
+        topic_corpus.append(contents)
+
+    return topic_ids, topic_corpus
+
+
+def build_corpus(n, topics):
+    """
+    Samples N items of every requested topic from the downloaded tarball.
+
+    Returns a pair: the list of document IDs, accumulated topic by topic in
+    the order the topics were given, and a dict mapping each topic to the
+    list of its document contents.
+    """
+    all_ids, docs_by_topic = [], {}
+    with tarfile.open(DATA_FILE, "r:gz") as archive:
+        for name in topics:
+            sampled_ids, sampled_docs = build_topic_corpus(archive, n, name)
+            all_ids += sampled_ids
+            docs_by_topic[name] = sampled_docs
+    return all_ids, docs_by_topic
+
+
+if __name__ == "__main__":
+    # Command line: STOP_WORDS N TOPIC [TOPIC ...]
+    if len(sys.argv) < 4:
+        print("usage: {} STOP_WORDS N TOPIC [ TOPIC [ ... ] ]".format(sys.argv[0]))
+        print("The program reads the file STOP_WORDS for stop words, extracts"
+              + " and generates a BoW model from N random articles of each TOPIC")
+        sys.exit(1)
+
+    logging.basicConfig(filename="newsgroups_extract.log",
+                        format="%(levelname)s:%(message)s",
+                        level=logging.INFO)
+
+    if not os.path.exists(DATA_FILE):
+        logging.info("Downloading data from '{}'".format(DATA_URL))
+        wget.download(DATA_URL, DATA_FILE)
+    # Verify the local copy (fresh or pre-existing) and abort when the digest
+    # does NOT match; the previous `not ... !=` test aborted only on a match.
+    # NOTE(review): DATA_SHA256 must be a full 64-character hex digest for
+    # this check to ever pass -- confirm the constant before relying on it.
+    with open(DATA_FILE, "rb") as f:
+        if hashlib.sha256(f.read()).hexdigest() != DATA_SHA256:
+            logging.error("'{}' is corrupted; aborting".format(DATA_FILE))
+            sys.exit(1)
+
+    # Read stop words list
+    try:
+        with open(sys.argv[1]) as stop_words_file:
+            stop_words = stop_words_file.read().split()
+    except (OSError, UnicodeDecodeError) as e:
+        logging.error("Could not read stop words: {}".format(e))
+        sys.exit(1)
+
+    try:
+        n = int(sys.argv[2])
+        if not 2 <= n <= 1000:
+            raise ValueError("N must be between 2 and 1000")
+    except ValueError as e:
+        logging.error("Invalid argument: {}".format(e))
+        sys.exit(1)
+
+    # Extract text corpus from tarball
+    logging.info("Building corpus")
+    # Drop repeated topics (keeping first occurrence): the corpus is keyed by
+    # topic, so a repeated topic would leave the IDs and the documents out of
+    # step with each other.
+    topics = []
+    for topic in sys.argv[3:]:
+        if topic not in topics:
+            topics.append(topic)
+    try:
+        ids, corpus = build_corpus(n, topics)
+    except ValueError as e:
+        logging.error("Invalid topic: {}".format(e))
+        sys.exit(1)
+
+    # Walk the topics in command-line order (not dict order) so the rows of
+    # the feature table line up with the IDs and labels written below.
+    corpus_text = []
+    for topic in topics:
+        corpus_text.extend(corpus[topic])
+
+    # Compute the TF-IDF matrix
+    logging.info("Computing TF-IDF matrix")
+    vectorizer = TfidfVectorizer(min_df=0.01, stop_words=stop_words)
+    X = vectorizer.fit_transform(corpus_text)
+
+    # Reduce data dimensionality using PCA; PCA cannot produce more components
+    # than min(n_samples, n_features), so cap the target for small corpora.
+    n_components = min(512, X.shape[0], X.shape[1])
+    logging.info("Computing PCA and reducing to {} dimensions".format(n_components))
+    X = PCA(n_components=n_components, whiten=True).fit_transform(X.toarray())
+
+    # Save all extracted features and related data
+    logging.info("Writing IDs file")
+    ids_fname = "newsgroups-{}-{}.ids".format(n, len(topics))
+    np.savetxt(ids_fname, ids, fmt="%s")
+
+    logging.info("Writing table file")
+    tbl_fname = "newsgroups-{}-{}.tbl".format(n, len(topics))
+    # PCA already returns a dense ndarray, which has no todense() method.
+    np.savetxt(tbl_fname, X, fmt="%f")
+
+    logging.info("Writing labels file")
+    labels_fname = "newsgroups-{}-{}.labels".format(n, len(topics))
+    # One label per kept document; counts can be below N when undecodable
+    # articles were skipped during extraction.
+    counts = [len(corpus[topic]) for topic in topics]
+    np.savetxt(labels_fname, np.repeat(topics, counts), fmt="%s")
diff --git a/datasets/newsgroups/source b/datasets/newsgroups/source
new file mode 100644
index 0000000..764f792
--- /dev/null
+++ b/datasets/newsgroups/source
@@ -0,0 +1 @@
+http://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.html
diff --git a/datasets/newsgroups/stop.sh b/datasets/newsgroups/stop.sh
new file mode 100644
index 0000000..36a5f74
--- /dev/null
+++ b/datasets/newsgroups/stop.sh
@@ -0,0 +1,12 @@
+# stop.sh
+#
+# Generate proper stop words list from the 'stop.txt' file.
+#
+# For every line of stop.txt, everything from the first '|' onwards (the
+# comment) is removed, then all spaces are removed; lines left empty are
+# dropped. The result is written to words.txt, one stop word per line.
+
+
+# Original source: http://snowball.tartarus.org/algorithms/english/stop.txt
+# NOTE: in our experiments, stop.txt has been modified to include the last stop
+# words (stop.txt is included).
+
+# 's/ //g' replaces 's/ \+//g': the '\+' repetition operator is a GNU
+# extension to basic regular expressions, and deleting every single space
+# gives the same result as deleting runs of spaces.
+sed -e 's/|.*//' -e 's/ //g' -e '/^$/d' <stop.txt >words.txt
diff --git a/datasets/newsgroups/stop.txt b/datasets/newsgroups/stop.txt
new file mode 100644
index 0000000..5d0a34b
--- /dev/null
+++ b/datasets/newsgroups/stop.txt
@@ -0,0 +1,310 @@
+
+ | An English stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+
+ | Many of the forms below are quite rare (e.g. "yourselves") but included for
+ | completeness.
+
+ | PRONOUNS FORMS
+ | 1st person sing
+
+i | subject, always in upper case of course
+
+me | object
+my | possessive adjective
+ | the possessive pronoun `mine' is best suppressed, because of the
+ | sense of coal-mine etc.
+myself | reflexive
+ | 1st person plural
+we | subject
+
+| us | object
+ | care is required here because US = United States. It is usually
+ | safe to remove it if it is in lower case.
+our | possessive adjective
+ours | possessive pronoun
+ourselves | reflexive
+ | second person (archaic `thou' forms not included)
+you | subject and object
+your | possessive adjective
+yours | possessive pronoun
+yourself | reflexive (singular)
+yourselves | reflexive (plural)
+ | third person singular
+he | subject
+him | object
+his | possessive adjective and pronoun
+himself | reflexive
+
+she | subject
+her | object and possessive adjective
+hers | possessive pronoun
+herself | reflexive
+
+it | subject and object
+its | possessive adjective
+itself | reflexive
+ | third person plural
+they | subject
+them | object
+their | possessive adjective
+theirs | possessive pronoun
+themselves | reflexive
+ | other forms (demonstratives, interrogatives)
+what
+which
+who
+whom
+this
+that
+these
+those
+
+ | VERB FORMS (using F.R. Palmer's nomenclature)
+ | BE
+am | 1st person, present
+is | -s form (3rd person, present)
+are | present
+was | 1st person, past
+were | past
+be | infinitive
+been | past participle
+being | -ing form
+ | HAVE
+have | simple
+has | -s form
+had | past
+having | -ing form
+ | DO
+do | simple
+does | -s form
+did | past
+doing | -ing form
+
+ | The forms below are, I believe, best omitted, because of the significant
+ | homonym forms:
+
+ | He made a WILL
+ | old tin CAN
+ | merry month of MAY
+ | a smell of MUST
+ | fight the good fight with all thy MIGHT
+
+ | would, could, should, ought might however be included
+
+ | | AUXILIARIES
+ | | WILL
+ |will
+
+would
+
+ | | SHALL
+ |shall
+
+should
+
+ | | CAN
+ |can
+
+could
+
+ | | MAY
+ |may
+ |might
+ | | MUST
+ |must
+ | | OUGHT
+
+ought
+
+ | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
+ | pronoun + verb
+
+i'm
+you're
+he's
+she's
+it's
+we're
+they're
+i've
+you've
+we've
+they've
+i'd
+you'd
+he'd
+she'd
+we'd
+they'd
+i'll
+you'll
+he'll
+she'll
+we'll
+they'll
+
+ | verb + negation
+
+isn't
+aren't
+wasn't
+weren't
+hasn't
+haven't
+hadn't
+doesn't
+don't
+didn't
+
+ | auxiliary + negation
+
+won't
+wouldn't
+shan't
+shouldn't
+can't
+cannot
+couldn't
+mustn't
+
+ | miscellaneous forms
+
+let's
+that's
+who's
+what's
+here's
+there's
+when's
+where's
+why's
+how's
+
+ | rarer forms
+
+ | daren't needn't
+
+ | doubtful forms
+
+ | oughtn't mightn't
+
+ | ARTICLES
+a
+an
+the
+
+ | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
+ | high, that classification is pointless.)
+and
+but
+if
+or
+because
+as
+until
+while
+
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+
+again
+further
+then
+once
+
+here
+there
+when
+where
+why
+how
+
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+
+one
+every
+least
+less
+many
+now
+ever
+never
+say
+says
+said
+also
+get
+go
+goes
+just
+made
+make
+put
+see
+seen
+whether
+like
+well
+back
+even
+still
+way
+take
+since
+another
+however
+two
+three
+four
+five
+first
+second
+new
+old
+high
+long
+
diff --git a/datasets/newsgroups/words.txt b/datasets/newsgroups/words.txt
new file mode 100644
index 0000000..0d11300
--- /dev/null
+++ b/datasets/newsgroups/words.txt
@@ -0,0 +1,216 @@
+i
+me
+my
+myself
+we
+our
+ours
+ourselves
+you
+your
+yours
+yourself
+yourselves
+he
+him
+his
+himself
+she
+her
+hers
+herself
+it
+its
+itself
+they
+them
+their
+theirs
+themselves
+what
+which
+who
+whom
+this
+that
+these
+those
+am
+is
+are
+was
+were
+be
+been
+being
+have
+has
+had
+having
+do
+does
+did
+doing
+would
+should
+could
+ought
+i'm
+you're
+he's
+she's
+it's
+we're
+they're
+i've
+you've
+we've
+they've
+i'd
+you'd
+he'd
+she'd
+we'd
+they'd
+i'll
+you'll
+he'll
+she'll
+we'll
+they'll
+isn't
+aren't
+wasn't
+weren't
+hasn't
+haven't
+hadn't
+doesn't
+don't
+didn't
+won't
+wouldn't
+shan't
+shouldn't
+can't
+cannot
+couldn't
+mustn't
+let's
+that's
+who's
+what's
+here's
+there's
+when's
+where's
+why's
+how's
+a
+an
+the
+and
+but
+if
+or
+because
+as
+until
+while
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+again
+further
+then
+once
+here
+there
+when
+where
+why
+how
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+one
+every
+least
+less
+many
+now
+ever
+never
+say
+says
+said
+also
+get
+go
+goes
+just
+made
+make
+put
+see
+seen
+whether
+like
+well
+back
+even
+still
+way
+take
+since
+another
+however
+two
+three
+four
+five
+first
+second
+new
+old
+high
+long
diff --git a/datasets/segmentation/segmentation_extract.py b/datasets/segmentation/segmentation_extract.py
new file mode 100644
index 0000000..e621161
--- /dev/null
+++ b/datasets/segmentation/segmentation_extract.py
@@ -0,0 +1,39 @@
+import hashlib
+import logging
+import pandas as pd
+import os
+import os.path
+import wget
+
+
+# Source of the image segmentation data, the SHA-256 hex digest the local copy
+# is checked against, and the local file name.
+DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/image/segmentation.test"
+DATA_SHA256 = "2e9e966479d54c6aaec309059376dd9c89c1b46bf3a23aceeefb36d20d93a189"
+DATA_FILE = "segmentation.test"
+
+
+if __name__ == "__main__":
+    logging.basicConfig(filename="segmentation_extract.log",
+                        format="%(levelname)s:%(message)s",
+                        level=logging.INFO)
+
+    if not os.path.exists(DATA_FILE):
+        logging.info("Downloading '{}'".format(DATA_URL))
+        wget.download(DATA_URL, DATA_FILE)
+    # Verify the local copy, whether freshly downloaded or already present.
+    with open(DATA_FILE, "rb") as f:
+        if hashlib.sha256(f.read()).hexdigest() != DATA_SHA256:
+            logging.error("{} is corrupted; aborting".format(DATA_FILE))
+            # The message promises an abort, so stop here instead of going on
+            # to parse a file that failed verification.
+            raise SystemExit(1)
+
+    # The first 4 lines of the file are skipped; the rest is comma-separated.
+    df = pd.read_table(DATA_FILE, header=None, skiprows=4, delimiter=",")
+
+    # First column contains class names, which we convert to numbers using the
+    # 'class_labels' dict. The names are sorted first so that every run gives
+    # the same number to the same class (plain set iteration order can differ
+    # between runs).
+    class_labels = {name: number
+                    for number, name in enumerate(sorted(set(df[0])))}
+
+    # Columns 0 (class name) and 3 are left out of the feature table.
+    data = df.drop([0, 3], axis=1)
+    data.to_csv("segmentation.tbl", sep=" ", index=False, header=False)
+
+    labels = df[0].map(class_labels)
+    labels.to_csv("segmentation.labels", sep=" ", index=False, header=False)
diff --git a/datasets/segmentation/source b/datasets/segmentation/source
new file mode 100644
index 0000000..ab98436
--- /dev/null
+++ b/datasets/segmentation/source
@@ -0,0 +1 @@
+https://archive.ics.uci.edu/ml/datasets/Image+Segmentation
diff --git a/datasets/wdbc/source b/datasets/wdbc/source
new file mode 100644
index 0000000..67d201a
--- /dev/null
+++ b/datasets/wdbc/source
@@ -0,0 +1 @@
+http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/
diff --git a/datasets/wdbc/wdbc_extract.py b/datasets/wdbc/wdbc_extract.py
new file mode 100644
index 0000000..9b6b84a
--- /dev/null
+++ b/datasets/wdbc/wdbc_extract.py
@@ -0,0 +1,34 @@
+import hashlib
+import logging
+import pandas as pd
+import os
+import os.path
+import wget
+
+
+# Source of the WDBC data, the SHA-256 hex digest the local copy is checked
+# against, and the local file name.
+DATA_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
+DATA_SHA256 = "d606af411f3e5be8a317a5a8b652b425aaf0ff38ca683d5327ffff94c3695f4a"
+DATA_FILE = "wdbc.data"
+
+
+if __name__ == "__main__":
+    logging.basicConfig(filename="wdbc_extract.log",
+                        format="%(levelname)s:%(message)s",
+                        level=logging.INFO)
+
+    if not os.path.exists(DATA_FILE):
+        # The closing quote around the URL was missing from this message.
+        logging.info("Downloading '{}'".format(DATA_URL))
+        wget.download(DATA_URL, DATA_FILE)
+    # Verify the local copy, whether freshly downloaded or already present.
+    with open(DATA_FILE, "rb") as f:
+        if hashlib.sha256(f.read()).hexdigest() != DATA_SHA256:
+            logging.error("'{}' is corrupted; aborting".format(DATA_FILE))
+            # SystemExit needs no import and does not depend on the `exit`
+            # helper that the site module installs.
+            raise SystemExit(1)
+
+    # Comma-separated, no header row: column 0 goes to the IDs file, column 1
+    # to the labels file, and the remaining columns form the feature table.
+    data = pd.read_table(DATA_FILE, header=None, delimiter=",")
+    wdbc_ids = data[0]
+    wdbc_labels = data[1]
+    wdbc = data.drop([0, 1], axis=1)
+
+    wdbc.to_csv("wdbc.tbl", sep=" ", index=False, header=False)
+    wdbc_labels.to_csv("wdbc.labels", sep=" ", index=False, header=False)
+    wdbc_ids.to_csv("wdbc.ids", sep=" ", index=False, header=False)