aboutsummaryrefslogtreecommitdiff
path: root/datasets/newsgroups
diff options
context:
space:
mode:
Diffstat (limited to 'datasets/newsgroups')
-rw-r--r-- datasets/newsgroups/README | 7
-rw-r--r-- datasets/newsgroups/newsgroups-500-3.ids | 1493
-rw-r--r-- datasets/newsgroups/newsgroups_extract.py | 137
-rw-r--r-- datasets/newsgroups/source | 1
-rw-r--r-- datasets/newsgroups/stop.sh | 12
-rw-r--r-- datasets/newsgroups/stop.txt | 310
-rw-r--r-- datasets/newsgroups/words.txt | 216
7 files changed, 2176 insertions, 0 deletions
diff --git a/datasets/newsgroups/README b/datasets/newsgroups/README
new file mode 100644
index 0000000..78e92ee
--- /dev/null
+++ b/datasets/newsgroups/README
@@ -0,0 +1,7 @@
+We ran the script newsgroups_extract.py with N = 500 and the topics
+'comp.graphics', 'misc.forsale', and 'sci.med'.
+
+The randomly generated ids in our case are in the file 'newsgroups-500-3.ids'.
+
+Before running the script, generate the stop-words file 'words.txt' by
+running 'stop.sh'.
diff --git a/datasets/newsgroups/newsgroups-500-3.ids b/datasets/newsgroups/newsgroups-500-3.ids
new file mode 100644
index 0000000..9b468bd
--- /dev/null
+++ b/datasets/newsgroups/newsgroups-500-3.ids
@@ -0,0 +1,1493 @@
+38998
+38380
+38224
+38260
+38642
+38718
+38728
+39495
+38338
+38696
+38984
+39068
+39665
+38657
+38944
+38811
+38421
+38324
+38580
+38430
+38541
+37950
+38483
+37940
+38794
+37913
+39053
+38412
+38916
+39677
+39644
+38502
+38960
+38594
+39022
+38777
+38855
+38865
+38909
+38511
+38831
+38813
+38522
+38558
+38504
+38273
+37947
+38802
+38354
+38639
+39016
+38808
+38608
+38772
+38673
+38928
+38452
+38887
+38795
+38864
+38484
+38305
+38975
+38833
+37918
+38856
+38344
+37936
+39670
+38776
+38547
+39063
+38731
+38353
+39064
+38760
+38220
+38231
+38459
+38889
+39072
+38878
+37958
+38278
+39642
+39496
+38962
+38948
+37932
+38700
+38963
+38393
+38405
+38753
+39035
+39671
+38445
+39012
+38896
+38787
+38839
+38259
+38560
+38803
+38751
+38613
+38595
+38822
+38610
+38996
+38945
+38845
+38933
+38530
+38474
+38981
+38265
+37935
+37961
+38650
+38599
+38486
+38229
+38364
+39002
+38321
+38299
+38645
+38505
+39031
+39061
+38882
+38921
+38747
+39017
+38913
+38559
+39041
+39030
+38535
+38911
+38809
+38296
+39624
+38397
+38233
+38345
+38458
+38871
+38854
+38255
+39034
+38375
+38872
+38791
+38232
+38646
+38544
+38314
+38980
+38286
+38716
+38720
+38997
+38792
+38886
+38714
+38605
+38685
+38769
+38899
+38582
+38736
+38799
+38683
+39623
+38306
+38551
+38652
+38532
+39067
+38986
+38804
+38668
+38626
+38274
+38373
+38999
+38644
+38498
+38978
+38834
+39622
+38357
+38262
+38287
+37943
+38991
+38322
+38313
+38492
+37948
+39620
+38550
+38327
+38816
+38609
+39011
+38542
+38835
+39647
+38763
+38715
+39656
+38987
+38239
+38587
+38507
+38627
+39636
+38623
+38957
+38451
+39650
+39049
+39669
+39039
+38968
+38399
+37945
+38312
+39059
+39019
+38693
+38369
+38801
+38477
+39044
+38525
+38641
+39640
+38733
+38702
+39037
+38234
+38294
+38332
+37952
+38466
+37931
+38443
+40027
+39004
+38619
+38298
+38325
+38348
+38807
+38455
+38876
+38869
+38767
+38800
+38863
+38555
+38439
+38625
+38337
+38284
+38840
+38764
+38914
+38361
+38884
+38765
+38904
+38493
+38310
+38953
+38584
+37938
+38972
+38471
+38655
+38995
+38225
+38925
+38930
+38713
+38958
+38531
+38628
+39490
+38330
+39673
+39010
+38476
+38758
+38832
+38651
+38276
+39637
+38433
+38588
+38669
+38223
+38311
+38387
+38721
+39079
+38881
+38852
+39737
+39488
+38723
+38251
+38597
+38389
+37914
+37930
+38554
+38830
+38724
+38741
+38867
+38947
+38745
+38977
+37925
+39626
+39066
+38413
+37957
+38617
+38690
+37939
+38496
+38464
+38221
+38748
+38775
+38618
+38942
+38632
+38257
+38612
+38927
+38331
+38382
+38631
+38365
+38660
+38729
+38398
+38826
+38691
+38407
+38812
+38253
+39006
+38342
+39634
+37953
+38759
+38810
+39062
+39076
+37921
+38261
+38434
+39054
+39615
+38381
+39047
+38782
+38280
+38992
+39661
+39084
+39014
+38424
+38779
+38749
+38983
+38637
+38709
+38686
+39027
+38323
+37962
+38923
+38228
+38931
+37916
+38099
+38727
+37942
+38844
+38966
+38843
+38746
+38347
+38317
+39040
+38708
+38790
+38665
+39075
+38938
+38725
+37924
+38937
+38437
+38674
+38692
+38527
+38400
+39638
+38562
+38744
+38526
+39042
+38340
+38902
+38825
+39069
+38570
+38362
+38564
+38275
+38903
+38403
+38581
+38536
+38567
+38377
+38781
+38906
+38328
+38216
+38624
+38293
+38240
+38545
+38635
+38891
+38877
+38463
+38658
+38994
+39020
+38890
+38850
+38654
+39668
+38814
+38515
+39023
+38292
+39643
+38666
+38352
+37963
+38719
+38319
+38411
+39676
+37922
+38303
+38436
+38244
+38820
+38621
+38680
+38773
+38687
+38450
+38252
+37261
+38269
+38444
+38989
+39655
+38579
+38552
+38670
+38320
+38533
+38620
+39001
+38846
+38734
+38241
+38238
+39080
+38429
+38499
+38982
+38277
+39057
+59193
+58141
+59454
+59029
+59020
+59187
+59453
+59121
+59531
+59581
+59514
+59533
+59622
+58050
+59015
+58114
+58799
+59297
+58092
+58065
+59388
+59603
+59069
+59640
+59580
+58997
+59021
+59381
+59604
+59119
+59353
+59318
+58918
+59480
+59234
+58866
+58113
+59341
+59136
+59635
+59301
+58053
+58981
+58778
+59415
+58940
+59007
+58048
+59006
+59028
+59396
+58796
+59455
+59001
+59062
+59180
+59436
+59463
+59299
+59075
+59320
+59236
+58118
+58836
+59628
+58931
+58943
+59158
+58963
+59345
+58978
+59265
+58851
+59024
+59063
+58111
+59254
+58830
+58055
+59197
+58824
+59014
+59190
+58785
+58071
+58897
+59131
+59639
+58955
+59044
+59092
+59224
+58093
+59225
+59334
+59200
+59369
+58774
+58797
+59220
+59059
+58129
+59351
+59192
+58133
+59491
+58898
+59145
+59448
+58869
+59198
+58578
+59270
+58760
+59241
+59085
+58794
+58854
+59404
+58790
+58127
+59417
+59250
+58888
+58881
+59571
+59434
+59539
+59025
+59642
+58817
+59129
+58989
+59115
+58841
+59372
+59058
+58098
+59641
+59594
+59401
+59142
+58967
+58877
+59513
+58954
+58070
+59469
+59385
+59357
+59368
+59595
+58942
+59041
+59364
+59356
+59052
+59358
+58947
+59082
+59346
+59430
+58867
+59451
+58973
+58977
+59228
+58770
+59153
+59522
+59347
+59185
+59556
+59414
+58970
+59100
+59287
+58878
+58904
+59602
+59002
+59428
+59498
+58112
+58153
+58052
+58064
+59573
+59507
+59138
+59196
+58789
+58910
+59481
+58951
+58577
+59515
+59040
+58798
+59591
+58884
+59077
+59398
+59336
+59268
+59378
+59140
+58920
+58091
+59487
+59575
+59027
+59510
+59425
+59540
+59276
+59079
+59106
+58938
+59047
+59127
+59303
+59344
+59638
+58807
+58969
+59389
+58825
+59034
+59512
+59326
+59122
+59629
+59419
+59033
+58155
+58919
+59205
+58862
+59493
+58808
+58121
+58966
+59496
+59380
+59506
+59148
+58968
+59291
+58109
+59476
+58894
+58145
+58883
+59625
+58054
+59468
+58140
+58900
+59212
+58932
+59111
+59466
+59026
+59076
+59202
+59049
+58139
+59211
+58956
+59097
+59505
+59472
+59046
+59517
+59072
+59199
+59030
+59373
+59342
+59156
+59444
+58934
+59500
+58096
+59330
+58082
+59400
+59309
+59605
+59548
+59474
+58833
+59566
+59070
+58056
+59157
+59537
+58569
+58103
+59518
+59288
+59273
+59132
+58944
+58783
+58777
+58926
+58084
+58820
+59606
+58046
+59402
+58911
+59008
+59362
+59160
+58154
+58840
+59172
+59178
+58764
+59210
+59061
+59176
+58804
+59088
+58110
+59023
+58834
+59449
+58871
+58946
+59286
+59031
+59327
+59627
+58902
+59619
+58142
+58852
+58806
+59048
+58868
+58810
+59585
+59379
+59329
+58964
+58803
+59467
+59460
+59089
+59139
+59446
+58896
+59587
+59154
+59532
+59482
+59599
+59504
+58099
+59264
+58809
+59054
+58045
+58787
+59223
+59624
+59179
+59393
+59471
+59313
+59081
+58985
+59435
+58126
+59376
+59253
+59144
+58800
+59339
+58846
+59169
+59509
+59564
+59528
+59354
+59577
+59134
+59159
+58849
+58839
+59123
+58152
+59203
+59516
+58568
+58843
+59439
+59488
+59281
+59382
+59343
+59167
+58850
+59112
+58857
+59011
+59102
+58150
+59569
+58078
+58057
+59257
+58801
+59275
+58079
+59305
+59283
+58979
+58971
+59394
+59258
+59238
+58873
+59597
+59151
+59416
+58991
+58061
+59440
+59022
+59289
+59464
+58922
+59411
+58786
+59352
+58876
+58097
+59530
+58811
+58819
+58782
+58791
+58909
+59308
+58100
+59280
+58072
+59328
+58829
+58086
+58880
+59242
+59319
+58831
+59064
+59039
+58805
+59590
+59547
+59091
+58123
+59623
+59584
+59080
+59243
+59399
+58905
+58832
+59648
+58950
+58813
+59244
+59226
+59208
+59101
+59003
+58802
+58769
+58891
+59422
+59104
+59045
+58089
+59125
+58766
+59544
+59095
+59568
+59479
+59427
+58107
+59099
+76462
+76866
+76902
+76127
+75855
+76143
+76013
+74765
+76477
+76491
+76528
+76247
+75893
+76494
+76594
+76111
+75881
+75987
+76497
+76248
+74763
+75892
+76343
+76115
+75843
+76001
+75884
+74732
+76054
+76263
+76396
+76517
+76074
+76086
+76165
+76808
+76596
+75920
+76305
+75873
+76507
+76936
+76292
+76077
+76571
+75903
+74778
+76200
+76011
+75978
+76043
+76505
+76819
+74758
+76085
+76137
+75986
+76856
+76091
+76666
+76339
+76344
+76833
+76347
+75993
+76785
+76169
+76644
+75898
+76863
+76504
+76789
+75944
+75848
+75849
+75941
+76816
+76476
+76008
+76322
+76470
+76755
+74757
+76672
+75945
+74818
+75951
+76160
+76194
+76044
+75851
+75868
+75902
+74734
+74819
+75852
+76677
+76927
+76087
+76445
+76342
+75933
+76363
+75984
+76084
+76014
+76082
+76420
+76678
+74735
+74773
+76238
+76804
+75889
+75869
+76117
+76052
+76639
+76098
+76102
+76475
+76312
+76070
+76661
+76119
+74739
+76291
+74794
+76093
+76217
+76259
+76245
+76360
+75926
+75942
+74795
+76467
+76302
+76126
+76640
+74725
+76276
+76662
+76473
+75896
+74806
+75930
+75916
+76586
+74820
+76432
+75911
+76063
+74744
+76224
+76056
+75983
+76213
+76496
+76189
+76834
+74729
+76679
+76813
+76488
+76274
+76072
+76794
+75862
+76880
+76850
+76021
+76944
+74746
+74830
+75888
+76367
+75953
+75969
+76003
+74807
+76825
+76848
+76406
+76826
+76050
+76176
+76334
+76308
+76311
+76250
+76882
+76231
+75982
+76482
+75980
+75918
+76329
+75905
+76664
+76298
+76490
+76188
+75872
+76215
+76353
+76458
+76521
+76538
+76840
+76453
+76134
+74769
+76421
+76057
+76288
+76065
+76831
+76704
+76624
+76434
+76567
+76253
+76107
+74730
+76824
+76149
+76184
+74814
+76258
+76252
+76139
+76278
+76812
+76506
+76537
+76170
+76835
+76096
+75971
+74828
+76757
+76845
+75904
+75051
+76140
+76486
+76459
+74792
+76362
+76364
+77056
+74781
+76359
+74776
+76046
+76281
+76844
+76516
+74743
+76261
+76381
+74740
+76108
+76132
+76378
+76405
+76161
+76608
+76133
+76275
+76783
+74741
+76198
+76435
+75854
+76606
+76301
+76485
+74768
+76038
+75997
+76837
+76851
+76860
+75989
+76414
+76350
+76852
+76214
+76030
+74774
+76481
+75947
+76446
+76124
+76512
+76151
+76318
+76089
+76556
+76478
+76118
+76180
+74824
+74150
+76075
+76101
+76781
+76418
+76500
+76349
+76182
+76881
+76024
+76039
+76450
+76045
+76109
+76489
+76079
+76809
+76939
+74772
+76510
+76658
+76069
+76601
+75912
+76438
+76691
+76078
+76270
+76447
+76461
+76861
+75891
+76437
+76314
+74798
+76254
+76192
+76836
+76240
+76878
+74727
+76779
+74761
+76148
+76522
+75846
+76417
+75876
+76141
+76233
+76425
+76547
+76940
+76800
+76357
+76212
+76328
+76827
+76650
+76558
+76649
+76372
+76251
+76655
+74810
+76416
+75929
+75948
+76681
+75908
+76842
+75927
+75940
+75842
+75864
+76935
+75994
+76832
+76209
+76310
+76597
+75906
+76659
+76671
+76035
+76391
+74808
+76868
+76591
+74797
+76455
+76546
+76853
+76120
+76053
+76273
+76144
+76092
+75958
+76460
+76879
+76463
+76479
+75860
+75992
+76792
+75955
+76574
+76515
+75962
+76123
+76648
+76317
+74720
+74826
+76685
+76348
+75946
+76220
+76563
+76204
+76578
+76855
+76048
+76088
+76060
+76025
+76553
+75883
+76227
+75961
+76508
+76145
+76793
+76181
+74785
+74721
+76193
+76041
+76171
+76638
+76210
+76645
+76256
+75899
+76099
+76823
+75981
+74752
+76673
+76095
+76324
+75879
+75895
+76131
+76780
+76229
+76820
+74726
+76023
+76299
+76796
+76152
+76599
+76354
+76575
+76456
+76419
+76572
+76545
+76603
+76287
+76600
+76315
+76647
+76684
+76172
+76177
+74804
+76665
+76340
+77014
+76094
+76582
+76338
+74829
+74793
+76174
+76199
+74736
+76570
diff --git a/datasets/newsgroups/newsgroups_extract.py b/datasets/newsgroups/newsgroups_extract.py
new file mode 100644
index 0000000..51c5030
--- /dev/null
+++ b/datasets/newsgroups/newsgroups_extract.py
@@ -0,0 +1,137 @@
+from sklearn.decomposition import PCA
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+import hashlib
+import logging
+import numpy as np
+import os
+import os.path
+import sys
+import tarfile
+import wget
+
+
+DATA_URL = "http://kdd.ics.uci.edu/databases/20newsgroups/20_newsgroups.tar.gz"  # fetched only when DATA_FILE is missing
+DATA_FILE = "20_newsgroups.tar.gz"  # local tarball name, opened by build_corpus()
+DATA_SHA256 = "b7bbf82b7831f7dbb1a09d9312f66fa78565c8de25526999b0d66f69d37e414"  # NOTE(review): appears to be 63 hex digits; a SHA-256 hexdigest has 64 -- confirm the value
+
+
def build_topic_corpus(corpus_file, n, topic):
    """
    Draws up to N random documents of one topic from an opened corpus tarball.

    corpus_file is an open tarfile.TarFile. Every regular file whose member
    name contains the string TOPIC is a candidate. The candidates are shuffled
    with NumPy's global random state and the first N of them are read.
    Documents that cannot be decoded as UTF-8 are logged and skipped, so fewer
    than N items may be returned.

    Returns a pair (ids, documents): ids holds the last path component of each
    kept member name and documents holds the decoded text, in the same order.
    Raises ValueError(topic) when no regular file matches the topic.
    """
    logging.info("Extracting corpus for topic '{}'".format(topic))
    # Walk the member list once instead of resolving every name a second time
    # with getmember().
    topic_items = [ti for ti in corpus_file.getmembers()
                   if topic in ti.name and ti.isfile()]
    if not topic_items:
        # Topic does not exist (no items fetched)
        raise ValueError(topic)

    topic_ids = []
    topic_corpus = []
    indices = np.arange(len(topic_items))
    np.random.shuffle(indices)
    for i in indices[:n]:
        ti = topic_items[i]
        with corpus_file.extractfile(ti) as f:
            raw = f.read()
        try:
            contents = str(raw, encoding="utf8")
        except ValueError as e:
            # UnicodeDecodeError is a ValueError; skip undecodable articles.
            logging.warning("Encoding error in '{}': {}".format(ti.name, e))
            continue
        _, item_id = os.path.split(ti.name)
        topic_ids.append(item_id)
        topic_corpus.append(contents)

    return topic_ids, topic_corpus
+
+
def build_corpus(n, topics):
    """
    Samples documents for every requested topic from the tarball DATA_FILE.

    Each topic is handed to build_topic_corpus together with N. Returns a pair
    (ids, corpus): ids is one flat list of document IDs, concatenated in the
    order the topics were given, and corpus is a dict mapping each topic to
    the list of its document contents.
    """
    all_ids, per_topic = [], {}
    with tarfile.open(DATA_FILE, "r:gz") as archive:
        for name in topics:
            doc_ids, docs = build_topic_corpus(archive, n, name)
            per_topic[name] = docs
            all_ids += doc_ids
    return all_ids, per_topic
+
+
if __name__ == "__main__":
    # Command-line driver: download the corpus if needed, sample N articles
    # per topic, build a TF-IDF matrix, reduce it with PCA and write the
    # .ids / .tbl / .labels files into the current directory.
    if len(sys.argv) < 4:
        print("usage: {} STOP_WORDS N TOPIC [ TOPIC [ ... ] ]".format(sys.argv[0]))
        print("The program reads the file STOP_WORDS for stop words, extracts"
              + " and generates a BoW model from N random articles of each TOPIC")
        sys.exit(1)

    logging.basicConfig(filename="newsgroups_extract.log",
                        format="%(levelname)s:%(message)s",
                        level=logging.INFO)

    if not os.path.exists(DATA_FILE):
        logging.info("Downloading data from '{}'".format(DATA_URL))
        wget.download(DATA_URL, DATA_FILE)
        with open(DATA_FILE, "rb") as f:
            digest = hashlib.sha256(f.read()).hexdigest()
        if len(DATA_SHA256) != len(digest):
            # A reference digest of the wrong length can never match, so
            # comparing against it would reject every download.
            logging.warning("Reference SHA-256 is malformed; skipping "
                            "integrity check of '{}'".format(DATA_FILE))
        elif digest != DATA_SHA256.lower():
            logging.error("'{}' is corrupted; aborting".format(DATA_FILE))
            # The check only runs right after a download, so a corrupt file
            # left on disk would be trusted silently on the next run.
            os.remove(DATA_FILE)
            sys.exit(1)

    # Read stop words list
    try:
        with open(sys.argv[1]) as stop_words_file:
            stop_words = stop_words_file.read().split()
    except (OSError, UnicodeError) as e:
        logging.error("Could not read stop words: {}".format(e))
        sys.exit(1)

    try:
        n = int(sys.argv[2])
        if (n < 2) or (n > 1000):
            raise ValueError("N must be between 2 and 1000")
    except ValueError as e:
        logging.error("Invalid argument: {}".format(e))
        sys.exit(1)

    topics = sys.argv[3:]
    if len(set(topics)) != len(topics):
        # A repeated topic would be sampled twice for the IDs but stored only
        # once in the corpus dict, so IDs and rows would no longer line up.
        logging.error("Invalid argument: duplicate topic")
        sys.exit(1)

    # Extract text corpus from tarball
    logging.info("Building corpus")
    try:
        ids, corpus = build_corpus(n, topics)
    except ValueError as e:
        logging.error("Invalid topic: {}".format(e))
        sys.exit(1)

    # Walk the topics in command-line order (the order in which build_corpus
    # collected the IDs) so that rows, IDs and labels stay aligned without
    # depending on dict iteration order.
    corpus_text = []
    for topic in topics:
        corpus_text.extend(corpus[topic])

    # Compute the TF-IDF matrix
    logging.info("Computing TF-IDF matrix")
    vectorizer = TfidfVectorizer(min_df=0.01, stop_words=stop_words)
    X = vectorizer.fit_transform(corpus_text)

    # Reduce data dimensionality using PCA. The number of components cannot
    # exceed the number of samples or of features, so clamp the target of 512.
    n_components = min(512, X.shape[0], X.shape[1])
    logging.info("Computing PCA and reducing to {} dimensions".format(n_components))
    X = PCA(n_components=n_components, whiten=True).fit_transform(X.toarray())

    # Save all extracted features and related data
    logging.info("Writing IDs file")
    ids_fname = "newsgroups-{}-{}.ids".format(n, len(topics))
    np.savetxt(ids_fname, ids, fmt="%s")

    logging.info("Writing table file")
    tbl_fname = "newsgroups-{}-{}.tbl".format(n, len(topics))
    # PCA.fit_transform already returns a dense ndarray (no .todense()).
    np.savetxt(tbl_fname, X, fmt="%f")

    logging.info("Writing labels file")
    labels_fname = "newsgroups-{}-{}.labels".format(n, len(topics))
    counts = [len(corpus[topic]) for topic in topics]
    np.savetxt(labels_fname, np.repeat(topics, counts), fmt="%s")
diff --git a/datasets/newsgroups/source b/datasets/newsgroups/source
new file mode 100644
index 0000000..764f792
--- /dev/null
+++ b/datasets/newsgroups/source
@@ -0,0 +1 @@
+http://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.html
diff --git a/datasets/newsgroups/stop.sh b/datasets/newsgroups/stop.sh
new file mode 100644
index 0000000..36a5f74
--- /dev/null
+++ b/datasets/newsgroups/stop.sh
@@ -0,0 +1,12 @@
#!/bin/sh
# stop.sh
#
# Generate the plain stop-word list 'words.txt' (one word per line, no
# comments, no blank lines) from the annotated 'stop.txt' file.
#
# Original source: http://snowball.tartarus.org/algorithms/english/stop.txt
# NOTE: in our experiments, stop.txt has been modified so that the last group
# of stop words is included (the modified stop.txt is shipped alongside).

# 1. drop everything from the first '|' (comment marker) to the end of line
# 2. strip all whitespace; the POSIX class also covers tabs, and avoids the
#    GNU-only '\+' basic-regex extension
# 3. delete the lines that are left empty
sed -e 's/|.*//' \
    -e 's/[[:space:]]//g' \
    -e '/^$/d' <stop.txt >words.txt
diff --git a/datasets/newsgroups/stop.txt b/datasets/newsgroups/stop.txt
new file mode 100644
index 0000000..5d0a34b
--- /dev/null
+++ b/datasets/newsgroups/stop.txt
@@ -0,0 +1,310 @@
+
+ | An English stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+
+ | Many of the forms below are quite rare (e.g. "yourselves") but included for
+ | completeness.
+
+ | PRONOUNS FORMS
+ | 1st person sing
+
+i | subject, always in upper case of course
+
+me | object
+my | possessive adjective
+ | the possessive pronoun `mine' is best suppressed, because of the
+ | sense of coal-mine etc.
+myself | reflexive
+ | 1st person plural
+we | subject
+
+| us | object
+ | care is required here because US = United States. It is usually
+ | safe to remove it if it is in lower case.
+our | possessive adjective
+ours | possessive pronoun
+ourselves | reflexive
+ | second person (archaic `thou' forms not included)
+you | subject and object
+your | possessive adjective
+yours | possessive pronoun
+yourself | reflexive (singular)
+yourselves | reflexive (plural)
+ | third person singular
+he | subject
+him | object
+his | possessive adjective and pronoun
+himself | reflexive
+
+she | subject
+her | object and possessive adjective
+hers | possessive pronoun
+herself | reflexive
+
+it | subject and object
+its | possessive adjective
+itself | reflexive
+ | third person plural
+they | subject
+them | object
+their | possessive adjective
+theirs | possessive pronoun
+themselves | reflexive
+ | other forms (demonstratives, interrogatives)
+what
+which
+who
+whom
+this
+that
+these
+those
+
+ | VERB FORMS (using F.R. Palmer's nomenclature)
+ | BE
+am | 1st person, present
+is | -s form (3rd person, present)
+are | present
+was | 1st person, past
+were | past
+be | infinitive
+been | past participle
+being | -ing form
+ | HAVE
+have | simple
+has | -s form
+had | past
+having | -ing form
+ | DO
+do | simple
+does | -s form
+did | past
+doing | -ing form
+
+ | The forms below are, I believe, best omitted, because of the significant
+ | homonym forms:
+
+ | He made a WILL
+ | old tin CAN
+ | merry month of MAY
+ | a smell of MUST
+ | fight the good fight with all thy MIGHT
+
+ | would, could, should, ought might however be included
+
+ | | AUXILIARIES
+ | | WILL
+ |will
+
+would
+
+ | | SHALL
+ |shall
+
+should
+
+ | | CAN
+ |can
+
+could
+
+ | | MAY
+ |may
+ |might
+ | | MUST
+ |must
+ | | OUGHT
+
+ought
+
+ | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
+ | pronoun + verb
+
+i'm
+you're
+he's
+she's
+it's
+we're
+they're
+i've
+you've
+we've
+they've
+i'd
+you'd
+he'd
+she'd
+we'd
+they'd
+i'll
+you'll
+he'll
+she'll
+we'll
+they'll
+
+ | verb + negation
+
+isn't
+aren't
+wasn't
+weren't
+hasn't
+haven't
+hadn't
+doesn't
+don't
+didn't
+
+ | auxiliary + negation
+
+won't
+wouldn't
+shan't
+shouldn't
+can't
+cannot
+couldn't
+mustn't
+
+ | miscellaneous forms
+
+let's
+that's
+who's
+what's
+here's
+there's
+when's
+where's
+why's
+how's
+
+ | rarer forms
+
+ | daren't needn't
+
+ | doubtful forms
+
+ | oughtn't mightn't
+
+ | ARTICLES
+a
+an
+the
+
+ | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
+ | high, that classification is pointless.)
+and
+but
+if
+or
+because
+as
+until
+while
+
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+
+again
+further
+then
+once
+
+here
+there
+when
+where
+why
+how
+
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+
+one
+every
+least
+less
+many
+now
+ever
+never
+say
+says
+said
+also
+get
+go
+goes
+just
+made
+make
+put
+see
+seen
+whether
+like
+well
+back
+even
+still
+way
+take
+since
+another
+however
+two
+three
+four
+five
+first
+second
+new
+old
+high
+long
+
diff --git a/datasets/newsgroups/words.txt b/datasets/newsgroups/words.txt
new file mode 100644
index 0000000..0d11300
--- /dev/null
+++ b/datasets/newsgroups/words.txt
@@ -0,0 +1,216 @@
+i
+me
+my
+myself
+we
+our
+ours
+ourselves
+you
+your
+yours
+yourself
+yourselves
+he
+him
+his
+himself
+she
+her
+hers
+herself
+it
+its
+itself
+they
+them
+their
+theirs
+themselves
+what
+which
+who
+whom
+this
+that
+these
+those
+am
+is
+are
+was
+were
+be
+been
+being
+have
+has
+had
+having
+do
+does
+did
+doing
+would
+should
+could
+ought
+i'm
+you're
+he's
+she's
+it's
+we're
+they're
+i've
+you've
+we've
+they've
+i'd
+you'd
+he'd
+she'd
+we'd
+they'd
+i'll
+you'll
+he'll
+she'll
+we'll
+they'll
+isn't
+aren't
+wasn't
+weren't
+hasn't
+haven't
+hadn't
+doesn't
+don't
+didn't
+won't
+wouldn't
+shan't
+shouldn't
+can't
+cannot
+couldn't
+mustn't
+let's
+that's
+who's
+what's
+here's
+there's
+when's
+where's
+why's
+how's
+a
+an
+the
+and
+but
+if
+or
+because
+as
+until
+while
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+again
+further
+then
+once
+here
+there
+when
+where
+why
+how
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+one
+every
+least
+less
+many
+now
+ever
+never
+say
+says
+said
+also
+get
+go
+goes
+just
+made
+make
+put
+see
+seen
+whether
+like
+well
+back
+even
+still
+way
+take
+since
+another
+however
+two
+three
+four
+five
+first
+second
+new
+old
+high
+long