aboutsummaryrefslogtreecommitdiff
path: root/datasets/newsgroups/stop.sh
diff options
context:
space:
mode:
Diffstat (limited to 'datasets/newsgroups/stop.sh')
-rw-r--r--datasets/newsgroups/stop.sh12
1 files changed, 12 insertions, 0 deletions
diff --git a/datasets/newsgroups/stop.sh b/datasets/newsgroups/stop.sh
new file mode 100644
index 0000000..36a5f74
--- /dev/null
+++ b/datasets/newsgroups/stop.sh
@@ -0,0 +1,12 @@
+# stop.sh
+#
+# Generate a proper stop-word list ('words.txt') from the 'stop.txt' file.
+
+
+# Original source: http://snowball.tartarus.org/algorithms/english/stop.txt
+# NOTE: in our experiments, stop.txt has been modified to include the last stop
+# words (stop.txt is included).
+
+sed 's/|.*//g' <stop.txt \
+ | sed 's/ \+//g' \
+ | sed '/^$/d' >words.txt