#!/bin/sh
# Build wordfreq.tmp: one "BASE WORD COUNT" line per distinct word produced
# by words.sh for the input file $1, sorted so that words sharing a base end
# up next to each other.
#
# uniq -c only counts adjacent duplicates, so words.sh is presumably expected
# to emit its words already sorted -- TODO confirm against words.sh.

# word_base WORD
# Print the comparison key ("base") of WORD: the word with its first hyphen
# removed and non-ASCII characters transliterated to ASCII.
# NOTE(review): sed 's/-//' has no g flag, so only the FIRST hyphen is
# dropped ("mother-in-law" -> "motherin-law"); confirm that is intended.
word_base() {
    printf '%s\n' "$1" | sed 's/-//' | iconv -f UTF8 -t ASCII//TRANSLIT
}

# Every expansion is quoted: the input path may contain whitespace, and a
# word or base may contain glob characters (iconv can substitute '?' for
# characters it cannot transliterate) that an unquoted echo would expand
# against the current directory.  read -r keeps backslashes literal.
sh words.sh "$1" | uniq -c | while read -r count word; do
    base=$(word_base "$word")
    # An empty base is omitted together with its separator, so the line
    # starts with the word instead of a leading blank.
    printf '%s\n' "${base:+$base }$word $count"
done | sort > wordfreq.tmp
# Build wordfreq.sed: one substitution rule per entry of wordfreq.bad_words
# that wraps the bad word in <corr was="BAD">GOOD</corr>.
#
# GOOD is the single spelling in wordfreq.tmp that shares the bad word's base
# and is not itself listed in wordfreq.bad_words.  The script stops with
# status 1 (message on stderr) when a bad word has no such spelling, or more
# than one.  Bad words that do not occur in wordfreq.tmp produce no rule.

# good_spellings BASE
# Print, one per line, every word in wordfreq.tmp whose base field is exactly
# BASE and which is not a whole line of wordfreq.bad_words.
good_spellings() {
    # grep -Fw is only a cheap prefilter: it also hits lines where BASE merely
    # occurs inside a hyphenated word of another base (base "today" matches
    # the word "today-ish"), so the base field is compared exactly below.
    grep -Fw -- "$1" wordfreq.tmp | while read -r cand_base cand_word junk; do
        [ "$cand_base" = "$1" ] || continue
        grep -Fxq -- "$cand_word" wordfreq.bad_words ||
            printf '%s\n' "$cand_word"
    done
}

# Start from an empty rule file (portable replacement for "echo -n ''").
: > wordfreq.sed

while read -r bad_word; do
    # " $bad_word " (blank on both sides) can only match the middle, WORD,
    # column of a "BASE WORD COUNT" line.
    grep -F -- " $bad_word " wordfreq.tmp | while read -r base junk; do
        # Join the candidates with single blanks.  They arrive one per line,
        # and "case" does no field splitting, so without this step the
        # "more than one candidate" arm below could never match.
        words=$(good_spellings "$base" | tr '\n' ' ')
        words=${words% }
        case $words in
            *' '*)
                echo "Add all but one of $words to wordfreq.bad_words" >&2
                exit 1
                ;;
            '')
                echo "No valid replacement for $bad_word" >&2
                exit 1
                ;;
            *)
                # escape.sh is not visible here; presumably it escapes sed
                # regex metacharacters in the bad word -- TODO confirm.
                pattern=$(printf '%s\n' "$bad_word" | sh escape.sh)
                # NOTE(review): the rule has no g flag, so only the first
                # occurrence on each input line is wrapped -- confirm that
                # this is intended.
                printf '%s\n' \
                    "s/\<$pattern\>/<corr was=\"&\">$words<\/corr>/" \
                    >> wordfreq.sed
                ;;
        esac
    # The loop above is the tail of a pipeline and therefore runs in a
    # subshell: its "exit 1" only ends that subshell.  Propagate the failure
    # so the whole script really stops.
    done || exit 1
done < wordfreq.bad_words
# Apply the generated correction rules to the input document ($1) and write
# the marked-up result to $2.  Both paths are quoted so that names containing
# whitespace or glob characters are passed through unchanged.
sed -f wordfreq.sed "$1" > "$2"
# Write wordfreq.log: for every base that has more than one spelling in
# wordfreq.tmp, list each spelling with its count and, where wordfreq.sed
# rewrites the spelling, what it is replaced by.

# format_log
# Read "BASE WORD COUNT REPLACEMENT..." lines (lines of one base adjacent)
# on stdin and print a "Base BASE" header followed by one indented entry per
# spelling, for bases that occur on at least two lines.
format_log() {
    awk '
    # Print one indented entry; mention the replacement only if it differs.
    function show(word, count, repl) {
        if (repl != word)
            print "    " word " " count " (replaced by " repl ")"
        else
            print "    " word " " count
    }
    {
        # The replacement is everything from field 4 on: the generated
        # <corr was="..."> markup contains a blank, so $4 alone would be
        # cut off at "<corr".
        repl = $4
        for (i = 5; i <= NF; i++)
            repl = repl " " $i

        if ($1 == prev_base) {
            # Second line of a group: emit the header and the first line,
            # which was held back until the base was known to repeat.
            if ($1 != base) {
                base = $1
                print "Base " $1
                show(prev_word, prev_count, prev_repl)
            }
            show($2, $3, repl)
        }
        prev_base = $1
        prev_word = $2
        prev_count = $3
        prev_repl = repl
    }'
}

# Run all words through wordfreq.sed in a single sed process (the rules are
# per-line substitutions, so one word per line gives the same result as one
# sed call per word) and glue each result back onto its wordfreq.tmp line.
awk '{ print $2 }' wordfreq.tmp |
    sed -f wordfreq.sed |
    paste -d ' ' wordfreq.tmp - |
    format_log > wordfreq.log
