aboutsummaryrefslogtreecommitdiff
path: root/datasets/newsgroups/stop.sh
blob: 36a5f7410ef6a1cd19c7c72b57088c32e53e5cc6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
# stop.sh
#
# Generate proper stop words list from the 'stop.txt' file.


# Original source: http://snowball.tartarus.org/algorithms/english/stop.txt
# NOTE: for our experiments, stop.txt was modified to include the last stop
# words; the modified stop.txt is included in this repository.

# generate_stop_words
#
# Filter a Snowball-style stop word list read from stdin and write the bare
# words to stdout, one per line:
#   1. drop everything from the first '|' to the end of the line (comments);
#   2. delete all remaining whitespace;
#   3. delete lines that end up empty.
generate_stop_words() {
    # A single sed process handles all three steps.  [[:space:]] is used
    # instead of ' \+' because '\+' in a basic regular expression is a GNU
    # extension, and because it also strips tabs and the carriage returns of
    # CRLF-encoded input.
    sed -e 's/|.*//' \
        -e 's/[[:space:]]//g' \
        -e '/^$/d'
}

# The input redirection comes first on purpose: if stop.txt is missing the
# command fails with a non-zero status before words.txt is created or
# truncated.
generate_stop_words <stop.txt >words.txt