Bash Crude word frequency analysis on a text file
Bash
Download (.zip)
#!/bin/bash
# wf.sh: Crude word frequency analysis on a text file.
# This is a more efficient version of the "wf2.sh" script.
#
# Usage:   wf.sh filename
# Output:  one line per distinct word, prefixed with its occurrence count,
#+         most frequent first (written to stdout).
# Exit status:
#   0   success
#   65  wrong number of command-line arguments
#   66  the named file does not exist (or is not a regular file)

# Check for input file on command line.
readonly ARGS=1
readonly E_BADARGS=65
readonly E_NOFILE=66

if [[ "$#" -ne "$ARGS" ]]   # Correct number of arguments passed to script?
then
  # Diagnostics go to stderr so they never mix with the frequency table.
  echo "Usage: $(basename -- "$0") filename" >&2
  exit "$E_BADARGS"
fi

if [[ ! -f "$1" ]]          # Check if file exists.
then
  echo "File \"$1\" does not exist." >&2
  exit "$E_NOFILE"
fi


########################################################
# main ()
#
# Filter out periods and commas, and
#+ change space between words to linefeed,
#+ then shift characters to lowercase, and
#+ finally prefix occurrence count and sort numerically.
#
# The space-to-linefeed step uses 'tr' rather than a sed replacement
#+ containing an escaped literal newline, so that reformatting this
#+ file cannot silently turn it into a no-op.
sed -e 's/\.//g' -e 's/,//g' -- "$1" \
  | tr ' ' '\n' \
  | tr 'A-Z' 'a-z' \
  | sort \
  | uniq -c \
  | sort -nr
#   =========================
#    Frequency of occurrence
########################################################

# Exercises:
# ---------
# 1) Add 'sed' commands to filter out other punctuation,
#+   such as semicolons.
# 2) Modify to also filter out multiple spaces and other whitespace.
# 3) Add a secondary sort key, so that instances of equal occurrence
#+   are sorted alphabetically.

exit 0
|