From b255338295587246292dc978e7d4d5687ee01fb4 Mon Sep 17 00:00:00 2001 From: Samuel Fadel Date: Fri, 19 Aug 2016 14:20:57 -0300 Subject: Scripts and other files for building all datasets. --- datasets/newsgroups/README | 7 + datasets/newsgroups/newsgroups-500-3.ids | 1493 +++++++++++++++++++++++++++++ datasets/newsgroups/newsgroups_extract.py | 137 +++ datasets/newsgroups/source | 1 + datasets/newsgroups/stop.sh | 12 + datasets/newsgroups/stop.txt | 310 ++++++ datasets/newsgroups/words.txt | 216 +++++ 7 files changed, 2176 insertions(+) create mode 100644 datasets/newsgroups/README create mode 100644 datasets/newsgroups/newsgroups-500-3.ids create mode 100644 datasets/newsgroups/newsgroups_extract.py create mode 100644 datasets/newsgroups/source create mode 100644 datasets/newsgroups/stop.sh create mode 100644 datasets/newsgroups/stop.txt create mode 100644 datasets/newsgroups/words.txt (limited to 'datasets/newsgroups') diff --git a/datasets/newsgroups/README b/datasets/newsgroups/README new file mode 100644 index 0000000..78e92ee --- /dev/null +++ b/datasets/newsgroups/README @@ -0,0 +1,7 @@ +For running the script newsgroups_extract.py we used N = 500 and topics are +'comp.graphics', 'misc.forsale', and 'sci.med'. + +The randomly generated ids in our case are in the file 'newsgroups-500-3.ids'. + +Before running the script, be sure to generate the stopwords file using +'stop.sh'. 
diff --git a/datasets/newsgroups/newsgroups-500-3.ids b/datasets/newsgroups/newsgroups-500-3.ids new file mode 100644 index 0000000..9b468bd --- /dev/null +++ b/datasets/newsgroups/newsgroups-500-3.ids @@ -0,0 +1,1493 @@ +38998 +38380 +38224 +38260 +38642 +38718 +38728 +39495 +38338 +38696 +38984 +39068 +39665 +38657 +38944 +38811 +38421 +38324 +38580 +38430 +38541 +37950 +38483 +37940 +38794 +37913 +39053 +38412 +38916 +39677 +39644 +38502 +38960 +38594 +39022 +38777 +38855 +38865 +38909 +38511 +38831 +38813 +38522 +38558 +38504 +38273 +37947 +38802 +38354 +38639 +39016 +38808 +38608 +38772 +38673 +38928 +38452 +38887 +38795 +38864 +38484 +38305 +38975 +38833 +37918 +38856 +38344 +37936 +39670 +38776 +38547 +39063 +38731 +38353 +39064 +38760 +38220 +38231 +38459 +38889 +39072 +38878 +37958 +38278 +39642 +39496 +38962 +38948 +37932 +38700 +38963 +38393 +38405 +38753 +39035 +39671 +38445 +39012 +38896 +38787 +38839 +38259 +38560 +38803 +38751 +38613 +38595 +38822 +38610 +38996 +38945 +38845 +38933 +38530 +38474 +38981 +38265 +37935 +37961 +38650 +38599 +38486 +38229 +38364 +39002 +38321 +38299 +38645 +38505 +39031 +39061 +38882 +38921 +38747 +39017 +38913 +38559 +39041 +39030 +38535 +38911 +38809 +38296 +39624 +38397 +38233 +38345 +38458 +38871 +38854 +38255 +39034 +38375 +38872 +38791 +38232 +38646 +38544 +38314 +38980 +38286 +38716 +38720 +38997 +38792 +38886 +38714 +38605 +38685 +38769 +38899 +38582 +38736 +38799 +38683 +39623 +38306 +38551 +38652 +38532 +39067 +38986 +38804 +38668 +38626 +38274 +38373 +38999 +38644 +38498 +38978 +38834 +39622 +38357 +38262 +38287 +37943 +38991 +38322 +38313 +38492 +37948 +39620 +38550 +38327 +38816 +38609 +39011 +38542 +38835 +39647 +38763 +38715 +39656 +38987 +38239 +38587 +38507 +38627 +39636 +38623 +38957 +38451 +39650 +39049 +39669 +39039 +38968 +38399 +37945 +38312 +39059 +39019 +38693 +38369 +38801 +38477 +39044 +38525 +38641 +39640 +38733 +38702 +39037 +38234 +38294 +38332 +37952 +38466 +37931 +38443 +40027 +39004 +38619 
+38298 +38325 +38348 +38807 +38455 +38876 +38869 +38767 +38800 +38863 +38555 +38439 +38625 +38337 +38284 +38840 +38764 +38914 +38361 +38884 +38765 +38904 +38493 +38310 +38953 +38584 +37938 +38972 +38471 +38655 +38995 +38225 +38925 +38930 +38713 +38958 +38531 +38628 +39490 +38330 +39673 +39010 +38476 +38758 +38832 +38651 +38276 +39637 +38433 +38588 +38669 +38223 +38311 +38387 +38721 +39079 +38881 +38852 +39737 +39488 +38723 +38251 +38597 +38389 +37914 +37930 +38554 +38830 +38724 +38741 +38867 +38947 +38745 +38977 +37925 +39626 +39066 +38413 +37957 +38617 +38690 +37939 +38496 +38464 +38221 +38748 +38775 +38618 +38942 +38632 +38257 +38612 +38927 +38331 +38382 +38631 +38365 +38660 +38729 +38398 +38826 +38691 +38407 +38812 +38253 +39006 +38342 +39634 +37953 +38759 +38810 +39062 +39076 +37921 +38261 +38434 +39054 +39615 +38381 +39047 +38782 +38280 +38992 +39661 +39084 +39014 +38424 +38779 +38749 +38983 +38637 +38709 +38686 +39027 +38323 +37962 +38923 +38228 +38931 +37916 +38099 +38727 +37942 +38844 +38966 +38843 +38746 +38347 +38317 +39040 +38708 +38790 +38665 +39075 +38938 +38725 +37924 +38937 +38437 +38674 +38692 +38527 +38400 +39638 +38562 +38744 +38526 +39042 +38340 +38902 +38825 +39069 +38570 +38362 +38564 +38275 +38903 +38403 +38581 +38536 +38567 +38377 +38781 +38906 +38328 +38216 +38624 +38293 +38240 +38545 +38635 +38891 +38877 +38463 +38658 +38994 +39020 +38890 +38850 +38654 +39668 +38814 +38515 +39023 +38292 +39643 +38666 +38352 +37963 +38719 +38319 +38411 +39676 +37922 +38303 +38436 +38244 +38820 +38621 +38680 +38773 +38687 +38450 +38252 +37261 +38269 +38444 +38989 +39655 +38579 +38552 +38670 +38320 +38533 +38620 +39001 +38846 +38734 +38241 +38238 +39080 +38429 +38499 +38982 +38277 +39057 +59193 +58141 +59454 +59029 +59020 +59187 +59453 +59121 +59531 +59581 +59514 +59533 +59622 +58050 +59015 +58114 +58799 +59297 +58092 +58065 +59388 +59603 +59069 +59640 +59580 +58997 +59021 +59381 +59604 +59119 +59353 +59318 +58918 +59480 +59234 +58866 +58113 +59341 +59136 
+59635 +59301 +58053 +58981 +58778 +59415 +58940 +59007 +58048 +59006 +59028 +59396 +58796 +59455 +59001 +59062 +59180 +59436 +59463 +59299 +59075 +59320 +59236 +58118 +58836 +59628 +58931 +58943 +59158 +58963 +59345 +58978 +59265 +58851 +59024 +59063 +58111 +59254 +58830 +58055 +59197 +58824 +59014 +59190 +58785 +58071 +58897 +59131 +59639 +58955 +59044 +59092 +59224 +58093 +59225 +59334 +59200 +59369 +58774 +58797 +59220 +59059 +58129 +59351 +59192 +58133 +59491 +58898 +59145 +59448 +58869 +59198 +58578 +59270 +58760 +59241 +59085 +58794 +58854 +59404 +58790 +58127 +59417 +59250 +58888 +58881 +59571 +59434 +59539 +59025 +59642 +58817 +59129 +58989 +59115 +58841 +59372 +59058 +58098 +59641 +59594 +59401 +59142 +58967 +58877 +59513 +58954 +58070 +59469 +59385 +59357 +59368 +59595 +58942 +59041 +59364 +59356 +59052 +59358 +58947 +59082 +59346 +59430 +58867 +59451 +58973 +58977 +59228 +58770 +59153 +59522 +59347 +59185 +59556 +59414 +58970 +59100 +59287 +58878 +58904 +59602 +59002 +59428 +59498 +58112 +58153 +58052 +58064 +59573 +59507 +59138 +59196 +58789 +58910 +59481 +58951 +58577 +59515 +59040 +58798 +59591 +58884 +59077 +59398 +59336 +59268 +59378 +59140 +58920 +58091 +59487 +59575 +59027 +59510 +59425 +59540 +59276 +59079 +59106 +58938 +59047 +59127 +59303 +59344 +59638 +58807 +58969 +59389 +58825 +59034 +59512 +59326 +59122 +59629 +59419 +59033 +58155 +58919 +59205 +58862 +59493 +58808 +58121 +58966 +59496 +59380 +59506 +59148 +58968 +59291 +58109 +59476 +58894 +58145 +58883 +59625 +58054 +59468 +58140 +58900 +59212 +58932 +59111 +59466 +59026 +59076 +59202 +59049 +58139 +59211 +58956 +59097 +59505 +59472 +59046 +59517 +59072 +59199 +59030 +59373 +59342 +59156 +59444 +58934 +59500 +58096 +59330 +58082 +59400 +59309 +59605 +59548 +59474 +58833 +59566 +59070 +58056 +59157 +59537 +58569 +58103 +59518 +59288 +59273 +59132 +58944 +58783 +58777 +58926 +58084 +58820 +59606 +58046 +59402 +58911 +59008 +59362 +59160 +58154 +58840 +59172 +59178 +58764 +59210 +59061 
+59176 +58804 +59088 +58110 +59023 +58834 +59449 +58871 +58946 +59286 +59031 +59327 +59627 +58902 +59619 +58142 +58852 +58806 +59048 +58868 +58810 +59585 +59379 +59329 +58964 +58803 +59467 +59460 +59089 +59139 +59446 +58896 +59587 +59154 +59532 +59482 +59599 +59504 +58099 +59264 +58809 +59054 +58045 +58787 +59223 +59624 +59179 +59393 +59471 +59313 +59081 +58985 +59435 +58126 +59376 +59253 +59144 +58800 +59339 +58846 +59169 +59509 +59564 +59528 +59354 +59577 +59134 +59159 +58849 +58839 +59123 +58152 +59203 +59516 +58568 +58843 +59439 +59488 +59281 +59382 +59343 +59167 +58850 +59112 +58857 +59011 +59102 +58150 +59569 +58078 +58057 +59257 +58801 +59275 +58079 +59305 +59283 +58979 +58971 +59394 +59258 +59238 +58873 +59597 +59151 +59416 +58991 +58061 +59440 +59022 +59289 +59464 +58922 +59411 +58786 +59352 +58876 +58097 +59530 +58811 +58819 +58782 +58791 +58909 +59308 +58100 +59280 +58072 +59328 +58829 +58086 +58880 +59242 +59319 +58831 +59064 +59039 +58805 +59590 +59547 +59091 +58123 +59623 +59584 +59080 +59243 +59399 +58905 +58832 +59648 +58950 +58813 +59244 +59226 +59208 +59101 +59003 +58802 +58769 +58891 +59422 +59104 +59045 +58089 +59125 +58766 +59544 +59095 +59568 +59479 +59427 +58107 +59099 +76462 +76866 +76902 +76127 +75855 +76143 +76013 +74765 +76477 +76491 +76528 +76247 +75893 +76494 +76594 +76111 +75881 +75987 +76497 +76248 +74763 +75892 +76343 +76115 +75843 +76001 +75884 +74732 +76054 +76263 +76396 +76517 +76074 +76086 +76165 +76808 +76596 +75920 +76305 +75873 +76507 +76936 +76292 +76077 +76571 +75903 +74778 +76200 +76011 +75978 +76043 +76505 +76819 +74758 +76085 +76137 +75986 +76856 +76091 +76666 +76339 +76344 +76833 +76347 +75993 +76785 +76169 +76644 +75898 +76863 +76504 +76789 +75944 +75848 +75849 +75941 +76816 +76476 +76008 +76322 +76470 +76755 +74757 +76672 +75945 +74818 +75951 +76160 +76194 +76044 +75851 +75868 +75902 +74734 +74819 +75852 +76677 +76927 +76087 +76445 +76342 +75933 +76363 +75984 +76084 +76014 +76082 +76420 +76678 +74735 +74773 +76238 
+76804 +75889 +75869 +76117 +76052 +76639 +76098 +76102 +76475 +76312 +76070 +76661 +76119 +74739 +76291 +74794 +76093 +76217 +76259 +76245 +76360 +75926 +75942 +74795 +76467 +76302 +76126 +76640 +74725 +76276 +76662 +76473 +75896 +74806 +75930 +75916 +76586 +74820 +76432 +75911 +76063 +74744 +76224 +76056 +75983 +76213 +76496 +76189 +76834 +74729 +76679 +76813 +76488 +76274 +76072 +76794 +75862 +76880 +76850 +76021 +76944 +74746 +74830 +75888 +76367 +75953 +75969 +76003 +74807 +76825 +76848 +76406 +76826 +76050 +76176 +76334 +76308 +76311 +76250 +76882 +76231 +75982 +76482 +75980 +75918 +76329 +75905 +76664 +76298 +76490 +76188 +75872 +76215 +76353 +76458 +76521 +76538 +76840 +76453 +76134 +74769 +76421 +76057 +76288 +76065 +76831 +76704 +76624 +76434 +76567 +76253 +76107 +74730 +76824 +76149 +76184 +74814 +76258 +76252 +76139 +76278 +76812 +76506 +76537 +76170 +76835 +76096 +75971 +74828 +76757 +76845 +75904 +75051 +76140 +76486 +76459 +74792 +76362 +76364 +77056 +74781 +76359 +74776 +76046 +76281 +76844 +76516 +74743 +76261 +76381 +74740 +76108 +76132 +76378 +76405 +76161 +76608 +76133 +76275 +76783 +74741 +76198 +76435 +75854 +76606 +76301 +76485 +74768 +76038 +75997 +76837 +76851 +76860 +75989 +76414 +76350 +76852 +76214 +76030 +74774 +76481 +75947 +76446 +76124 +76512 +76151 +76318 +76089 +76556 +76478 +76118 +76180 +74824 +74150 +76075 +76101 +76781 +76418 +76500 +76349 +76182 +76881 +76024 +76039 +76450 +76045 +76109 +76489 +76079 +76809 +76939 +74772 +76510 +76658 +76069 +76601 +75912 +76438 +76691 +76078 +76270 +76447 +76461 +76861 +75891 +76437 +76314 +74798 +76254 +76192 +76836 +76240 +76878 +74727 +76779 +74761 +76148 +76522 +75846 +76417 +75876 +76141 +76233 +76425 +76547 +76940 +76800 +76357 +76212 +76328 +76827 +76650 +76558 +76649 +76372 +76251 +76655 +74810 +76416 +75929 +75948 +76681 +75908 +76842 +75927 +75940 +75842 +75864 +76935 +75994 +76832 +76209 +76310 +76597 +75906 +76659 +76671 +76035 +76391 +74808 +76868 +76591 +74797 +76455 +76546 
"""Build a bag-of-words feature table from the 20 Newsgroups corpus.

Usage: newsgroups_extract.py STOP_WORDS N TOPIC [TOPIC ...]

For every TOPIC, up to N random articles are read from the 20 Newsgroups
tarball (downloaded on demand), vectorized with TF-IDF, reduced with PCA and
written to three text files in the current directory:

    newsgroups-<N>-<T>.ids     one document id per row
    newsgroups-<N>-<T>.tbl     one feature vector per row
    newsgroups-<N>-<T>.labels  one topic name per row

where <T> is the number of distinct topics.  Progress and errors are logged to
'newsgroups_extract.log'.
"""
import hashlib
import logging
import os
import os.path
import sys
import tarfile

import numpy as np

# scikit-learn and wget are imported inside main() so that the corpus helpers
# below can be imported (and tested) without those packages installed.

DATA_URL = "http://kdd.ics.uci.edu/databases/20newsgroups/20_newsgroups.tar.gz"
DATA_FILE = "20_newsgroups.tar.gz"
# NOTE(review): this value has 63 hex digits, one short of a SHA-256 hex
# digest, so it can never match a real archive.  Confirm the correct digest
# against the upstream tarball; until then main() skips the integrity check.
DATA_SHA256 = "b7bbf82b7831f7dbb1a09d9312f66fa78565c8de25526999b0d66f69d37e414"

_SHA256_HEX_LENGTH = 64
_MAX_PCA_COMPONENTS = 512


def build_topic_corpus(corpus_file, n, topic):
    """
    Draws up to N random documents of one topic from an open tar archive.

    A member belongs to the topic when TOPIC occurs anywhere in its path
    (substring match) and it is a regular file.  Documents that are not valid
    UTF-8 are logged and skipped, so fewer than N documents may be returned.

    Returns a pair (ids, documents): ids are the base names of the selected
    members and documents are their decoded contents, in the same order.
    Raises ValueError(topic) if no member matches the topic.
    """
    logging.info("Extracting corpus for topic '{}'".format(topic))
    # A single pass over the member list; looking every name up again with
    # getmember() would rescan the whole archive index for each document.
    topic_items = [member for member in corpus_file.getmembers()
                   if topic in member.name and member.isfile()]
    if not topic_items:
        # Topic does not exist (no items fetched)
        raise ValueError(topic)

    # Same RNG call sequence as before, so seeded runs select the same items.
    indices = np.arange(len(topic_items))
    np.random.shuffle(indices)

    topic_ids = []
    topic_corpus = []
    for i in indices[:n]:
        member = topic_items[i]
        with corpus_file.extractfile(member) as f:
            raw = f.read()
        try:
            contents = raw.decode("utf8")
        except UnicodeDecodeError as e:
            logging.warning("Encoding error in '{}': {}".format(member.name, e))
            continue
        topic_ids.append(os.path.basename(member.name))
        topic_corpus.append(contents)

    return topic_ids, topic_corpus


def build_corpus(n, topics):
    """
    Builds a corpus with each topic, with N items each.
    Returns a list of document IDs and a corpus which is a dict where each topic
    is a key mapped to a list of document contents.

    IDs are concatenated in the order of TOPICS.  Raises ValueError(topic) for
    a topic with no documents in the archive.
    """
    ids = []
    corpus = dict()
    with tarfile.open(DATA_FILE, "r:gz") as f:
        for topic in topics:
            topic_ids, topic_corpus = build_topic_corpus(f, n, topic)
            corpus[topic] = topic_corpus
            ids.extend(topic_ids)
    return ids, corpus


def _sha256_of_file(path, chunk_size=1 << 20):
    """Returns the SHA-256 hex digest of a file, read in fixed-size chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _archive_is_intact(path, expected):
    """
    Checks a file against an expected SHA-256 hex digest.

    If EXPECTED is not a well-formed digest the check cannot succeed for any
    file, so it is skipped with a warning and the file is accepted.
    """
    if len(expected) != _SHA256_HEX_LENGTH:
        logging.warning(
            "Expected SHA-256 digest has {} hex digits instead of {}; "
            "skipping integrity check of '{}'".format(
                len(expected), _SHA256_HEX_LENGTH, path))
        return True
    return _sha256_of_file(path) == expected.lower()


def main(argv=None):
    """
    Runs the extraction pipeline; ARGV defaults to sys.argv.
    Returns the process exit status (0 on success, 1 on any error).
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        print("usage: {} STOP_WORDS N TOPIC [ TOPIC [ ... ] ]".format(argv[0]))
        print("The program reads the file STOP_WORDS for stop words, extracts" +
              " and generates a BoW model from N random articles of each TOPIC")
        return 1

    from sklearn.decomposition import PCA
    from sklearn.feature_extraction.text import TfidfVectorizer

    logging.basicConfig(filename="newsgroups_extract.log",
                        format="%(levelname)s:%(message)s",
                        level=logging.INFO)

    if not os.path.exists(DATA_FILE):
        import wget
        logging.info("Downloading data from '{}'".format(DATA_URL))
        wget.download(DATA_URL, DATA_FILE)
    # Abort on a digest MISMATCH (the previous test was inverted and aborted
    # only when the archive was intact).
    if not _archive_is_intact(DATA_FILE, DATA_SHA256):
        logging.error("'{}' is corrupted; aborting".format(DATA_FILE))
        return 1

    # Read stop words list
    try:
        with open(argv[1]) as stop_words_file:
            stop_words = stop_words_file.read().split()
    except (OSError, UnicodeDecodeError) as e:
        logging.error("Could not read stop words: {}".format(e))
        return 1

    try:
        n = int(argv[2])
        if (n < 2) or (n > 1000):
            raise ValueError("N must be between 2 and 1000")
    except ValueError as e:
        logging.error("Invalid argument: {}".format(e))
        return 1

    # Extract text corpus from tarball
    logging.info("Building corpus")
    # Drop repeated topics (keeping first occurrences) so that ids, rows and
    # labels stay aligned one-to-one.
    topics = list(dict.fromkeys(argv[3:]))
    try:
        ids, corpus = build_corpus(n, topics)
    except ValueError as e:
        logging.error("Invalid topic: {}".format(e))
        return 1

    # Walk the topics in command-line order: this is the order of `ids`, so
    # every row of the table lines up with its id and its label.
    corpus_text = [doc for topic in topics for doc in corpus[topic]]

    # Compute the TF-IDF matrix
    logging.info("Computing TF-IDF matrix")
    vectorizer = TfidfVectorizer(min_df=0.01, stop_words=stop_words)
    X = vectorizer.fit_transform(corpus_text).toarray()

    # Reduce data dimensionality using PCA.  PCA cannot produce more
    # components than min(n_samples, n_features), so cap the target.
    n_components = min(_MAX_PCA_COMPONENTS, *X.shape)
    logging.info("Computing PCA and reducing to {} dimensions".format(
        n_components))
    X = PCA(n_components=n_components, whiten=True).fit_transform(X)

    # Save all extracted features and related data
    logging.info("Writing IDs file")
    ids_fname = "newsgroups-{}-{}.ids".format(n, len(topics))
    np.savetxt(ids_fname, ids, fmt="%s")

    logging.info("Writing table file")
    tbl_fname = "newsgroups-{}-{}.tbl".format(n, len(topics))
    # PCA returns a dense ndarray, which has no todense(); save it directly.
    np.savetxt(tbl_fname, X, fmt="%f")

    logging.info("Writing labels file")
    labels_fname = "newsgroups-{}-{}.labels".format(n, len(topics))
    counts = [len(corpus[topic]) for topic in topics]
    np.savetxt(labels_fname, np.repeat(topics, counts), fmt="%s")
    return 0


if __name__ == "__main__":
    sys.exit(main())
+ +sed 's/|.*//g' words.txt diff --git a/datasets/newsgroups/stop.txt b/datasets/newsgroups/stop.txt new file mode 100644 index 0000000..5d0a34b --- /dev/null +++ b/datasets/newsgroups/stop.txt @@ -0,0 +1,310 @@ + + | An English stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | Many of the forms below are quite rare (e.g. "yourselves") but included for + | completeness. + + | PRONOUNS FORMS + | 1st person sing + +i | subject, always in upper case of course + +me | object +my | possessive adjective + | the possessive pronoun `mine' is best suppressed, because of the + | sense of coal-mine etc. +myself | reflexive + | 1st person plural +we | subject + +| us | object + | care is required here because US = United States. It is usually + | safe to remove it if it is in lower case. +our | possessive adjective +ours | possessive pronoun +ourselves | reflexive + | second person (archaic `thou' forms not included) +you | subject and object +your | possessive adjective +yours | possessive pronoun +yourself | reflexive (singular) +yourselves | reflexive (plural) + | third person singular +he | subject +him | object +his | possessive adjective and pronoun +himself | reflexive + +she | subject +her | object and possessive adjective +hers | possessive pronoun +herself | reflexive + +it | subject and object +its | possessive adjective +itself | reflexive + | third person plural +they | subject +them | object +their | possessive adjective +theirs | possessive pronoun +themselves | reflexive + | other forms (demonstratives, interrogatives) +what +which +who +whom +this +that +these +those + + | VERB FORMS (using F.R. 
Palmer's nomenclature) + | BE +am | 1st person, present +is | -s form (3rd person, present) +are | present +was | 1st person, past +were | past +be | infinitive +been | past participle +being | -ing form + | HAVE +have | simple +has | -s form +had | past +having | -ing form + | DO +do | simple +does | -s form +did | past +doing | -ing form + + | The forms below are, I believe, best omitted, because of the significant + | homonym forms: + + | He made a WILL + | old tin CAN + | merry month of MAY + | a smell of MUST + | fight the good fight with all thy MIGHT + + | would, could, should, ought might however be included + + | | AUXILIARIES + | | WILL + |will + +would + + | | SHALL + |shall + +should + + | | CAN + |can + +could + + | | MAY + |may + |might + | | MUST + |must + | | OUGHT + +ought + + | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing + | pronoun + verb + +i'm +you're +he's +she's +it's +we're +they're +i've +you've +we've +they've +i'd +you'd +he'd +she'd +we'd +they'd +i'll +you'll +he'll +she'll +we'll +they'll + + | verb + negation + +isn't +aren't +wasn't +weren't +hasn't +haven't +hadn't +doesn't +don't +didn't + + | auxiliary + negation + +won't +wouldn't +shan't +shouldn't +can't +cannot +couldn't +mustn't + + | miscellaneous forms + +let's +that's +who's +what's +here's +there's +when's +where's +why's +how's + + | rarer forms + + | daren't needn't + + | doubtful forms + + | oughtn't mightn't + + | ARTICLES +a +an +the + + | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so + | high, that classification is pointless.) 
+and +but +if +or +because +as +until +while + +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under + +again +further +then +once + +here +there +when +where +why +how + +all +any +both +each +few +more +most +other +some +such + +no +nor +not +only +own +same +so +than +too +very + +one +every +least +less +many +now +ever +never +say +says +said +also +get +go +goes +just +made +make +put +see +seen +whether +like +well +back +even +still +way +take +since +another +however +two +three +four +five +first +second +new +old +high +long + diff --git a/datasets/newsgroups/words.txt b/datasets/newsgroups/words.txt new file mode 100644 index 0000000..0d11300 --- /dev/null +++ b/datasets/newsgroups/words.txt @@ -0,0 +1,216 @@ +i +me +my +myself +we +our +ours +ourselves +you +your +yours +yourself +yourselves +he +him +his +himself +she +her +hers +herself +it +its +itself +they +them +their +theirs +themselves +what +which +who +whom +this +that +these +those +am +is +are +was +were +be +been +being +have +has +had +having +do +does +did +doing +would +should +could +ought +i'm +you're +he's +she's +it's +we're +they're +i've +you've +we've +they've +i'd +you'd +he'd +she'd +we'd +they'd +i'll +you'll +he'll +she'll +we'll +they'll +isn't +aren't +wasn't +weren't +hasn't +haven't +hadn't +doesn't +don't +didn't +won't +wouldn't +shan't +shouldn't +can't +cannot +couldn't +mustn't +let's +that's +who's +what's +here's +there's +when's +where's +why's +how's +a +an +the +and +but +if +or +because +as +until +while +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under +again +further +then +once +here +there +when +where +why +how +all +any +both +each +few +more +most +other +some +such +no +nor +not +only +own +same +so +than +too +very +one +every +least +less +many +now +ever 
+never +say +says +said +also +get +go +goes +just +made +make +put +see +seen +whether +like +well +back +even +still +way +take +since +another +however +two +three +four +five +first +second +new +old +high +long -- cgit v1.2.3