#!/bin/sh
# stop.sh
#
# Generate a clean stop-word list (words.txt) from the 'stop.txt' file.


# Original source: http://snowball.tartarus.org/algorithms/english/stop.txt
# NOTE: for our experiments, stop.txt has been modified so that the last stop
# words are also included (the modified stop.txt is included).

# Turn stop.txt into words.txt, one bare word per line, in a single sed pass:
#   1. drop everything from the first '|' to the end of the line (treated as a
#      comment);
#   2. delete all whitespace -- [[:space:]] rather than the GNU-only ' \+', so
#      tabs and the CR of CRLF line endings are removed too and the script
#      works with any POSIX sed;
#   3. discard lines that are left empty.
# Reads stop.txt from, and writes words.txt to, the current directory.
sed -e 's/|.*//' \
    -e 's/[[:space:]]//g' \
    -e '/^$/d' <stop.txt >words.txt