#!/usr/local/bin/perl
#
#                                        David MacKay  2002
# processbook.p
#   - cleans out excess blank lines, without
# destroying paragraph structure
#
# Also turns all ". " into ". \n"
### Also turns all ", " into ", \n"
#
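# also strips off the Gutenberg header, but only when run with guten=1
# (the default is guten=0, which leaves the start of the input alone)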
#
# and replaces \w--\w by \w -- \w
#
# 
# example usage
# in /data/coll/djw30/carroll
# processbook.p alice30.txt > /data/coll/mackay/books/alice
# processbook.p  sbrun10.txt  > /data/coll/mackay/books/sbrun 
# processbook.p  snark12.txt  > /data/coll/mackay/books/snark
# cat /data/coll/djw30/books/oz/* |  processbook.p guten=0  > /data/coll/mackay/books/oz

# processbook.p /data/coll/djw30/books/swift/gltrv10.txt  > /data/coll/mackay/books/gltrv
# cat   /data/coll/djw30/books/copytales/* |   processbook.p guten=0  > /data/coll/mackay/books/tomswift
# cat /data/coll/djw30/input/script1.txt  |   processbook.p guten=0  > /data/coll/mackay/books/starwars
# cat /data/coll/djw30/input/khan.txt  |   processbook.p guten=0  > /data/coll/mackay/books/khan
# tail  -512 ~/dasher/dasher1.6.2/input/source |   processbook.p guten=0  > /data/coll/mackay/books/molerat
#    processbook.p guten=0  codingtheory  > /data/coll/mackay/books/codingtheory
#  processbook.p guten=0  /home/djw30/dasher/emma  > /data/coll/mackay/books/emma
# options
use strict;
use warnings;

# parse_options(\@args)
#   Removes leading "name=value" words from @$args and returns the
#   resulting option hash.  The only recognised option is
#     guten=1   get rid of the gutenberg header
#     guten=0   leave the start of the input alone (default)
#   Unknown names are still removed from @$args (so they are not
#   mistaken for file names) but are reported on STDERR and ignored.
sub parse_options {
    my ($args) = @_;
    my %option = ( guten => 0 );
    while ( @$args && $args->[0] =~ /^(\w+)=(.*)/ ) {
        my ( $name, $value ) = ( $1, $2 );
        shift @$args;
        if ( exists $option{$name} ) {
            $option{$name} = $value;
        }
        else {
            warn "processbook: ignoring unknown option '$name'\n";
        }
    }
    return %option;
}

# strip_gutenberg_header(@lines)
#   Returns the lines that follow the first line containing
#   "*END*THE SMALL PRINT"; the marker line itself is dropped.
#   If no line contains the marker, a warning is printed and all
#   lines are returned unchanged, so the text is never silently lost.
sub strip_gutenberg_header {
    my @lines = @_;
    for my $i ( 0 .. $#lines ) {
        if ( $lines[$i] =~ /\*END\*THE SMALL PRINT/ ) {
            return @lines[ $i + 1 .. $#lines ];
        }
    }
    warn "processbook: no '*END*THE SMALL PRINT' line found; keeping all input\n";
    return @lines;
}

# process_text($text)
#   Takes the whole book as one string and returns it re-flowed so that
#   a line break follows each detected sentence end.  All original
#   newlines are turned into spaces first, so the input's own line and
#   paragraph breaks do not survive.
sub process_text {
    my ($text) = @_;

    # titles whose "." must not be taken for the end of a sentence
    my @titles = qw(Mr Mrs Dr Prof Pr Sen);

    # remove c-M
    $text =~ s/\cM//g;
    # assume blah--blah should be blah -- blah
    $text =~ s/(\w)(--)/$1 $2/g;
    $text =~ s/(--)(\w)/$1 $2/g;
    # collapse a run of repeated "* * " into a single "* * "
    $text =~ s/(?:\* \* )+/* * /g;

    # handle mr, mrs, etc: hide their full stops behind placeholder words
    for my $title (@titles) {
        my $token = uc($title) . 'FULLSTOP';
        $text =~ s/\Q$title\E\./$token/g;
    }

    # every original newline becomes a space
    $text =~ tr/\n/ /;

    # break after 1-4 full stops (with optional closing quotes/brackets)
    # when whitespace follows
    # "?!" are handled separately because a lot of alice contains
    #     ?' she said
    $text =~ s/(\w[)"']{0,2}\.{1,4}[)"']{0,2})(\s)/$1 \n$2/g;
    # For ! and ? we need to spot the following capital letter.  It is
    # checked with a lookahead so that it is not consumed: a one-letter
    # sentence such as "I?" can then still start the next match.
    $text =~ s/(\w[)"']{0,2}[!?]{1,4}[)"']{0,2})(?=\s+["'`]{0,2}[A-Z])/$1 \n/g;

    # put the titles back now, before line lengths are measured below,
    # so that a placeholder does not make a short sentence look long
    for my $title (@titles) {
        my $token = uc($title) . 'FULLSTOP';
        $text =~ s/$token/$title./g;
    }

    # Remove double-spaces
    $text =~ s/[\t ][\t ]+/ /g;
    # glue together SHORT sentences by sucking up to the next sentence.
    # (a line of 1-17 characters is joined to the line that follows it)
    $text =~ s/\n([^\n]{1,17})\n/\n$1 /g;
    # Remove double-spaces, and blanks at the start of a line
    $text =~ s/[\t ][\t ]+/ /g;
    $text =~ s/\n[\t ]+/\n/g;

    return $text;
}

# main()
#   Reads the option words, then all input (named files or STDIN), and
#   prints the processed text.  The input is read with a single <>:
#   once <> has reported end-of-file, a further <> would start reading
#   STDIN afresh, so the header is stripped from the lines in memory.
sub main {
    my %option = parse_options( \@ARGV );
    my @lines  = <>;
    @lines = strip_gutenberg_header(@lines) if $option{guten};
    print process_text( join( '', @lines ) );
    return;
}

# run only when executed as a script, not when pulled in by require/do
main() unless caller;





