#!/usr/local/bin/perl -w
# corpus.p k=1
#
# uses glimpse and a dictionary to select lines from 
#   /data/coll/mackay/books/*
#
# writes its output to "o", eg corpus.txt in the current directory
# rm  /data/coll/mackay/books/*~ ;  glimpseindex -b  -B   -H ~/dasher/  /data/coll/mackay/books/
# corpus.p k=1 f=13
# corpus.p k=1 f=4

# see corpus

# options
$period = 5; # How many sentences before a paragraph. (i.e. new line)
$paraperiod = 25; # how many sentences before a \n\n
$f = 4 ;
$o = "corpus.txt" ;
$v=0; # whether to put verbose comments in the corpus itself
$dict = "/home/mackay/dasher/dict" ;
$dir = "/data/coll/mackay/books/" ;
#

# $glimpsecommand = "glimpse  -L 10000:100:100 -y -n -w -i -O -h -H ~/dasher/ ";
#$glimpsecommand = "glimpse   -L 10000:100:100  -y -n -w -i -O -h -H ~/dasher/ ";
$glimpsecommand1 = "glimpse ";
$glimpsecommand2 = "  -y -n -w -i -O -h -H ~/dasher/ ";
# -h suppresses filename
# -L limits to 100 matches per file

$addme = "additions" ; # file to cat onto the end of O
$verbose = 0 ;
$k = 1 ;  # required number of occurences of each word to grab from each document, if possible.
$Lstring = "  10000:100:100 " ;
# all these should exceed k ( I deliberately find more than just k in order to
# scan around the documents a bit and get variety.
$Lstring = "  30:30:30 " ;
eval "\$$1=\$2" while @ARGV && $ARGV[0]=~ /^(\w+)=(.*)/ && shift;

if ($f==15) {
    # all files:
    @filelist = ('KoreanEnglish' , 'alice', 'emma' , 'hawking' , 'molerat', 'phrases', 'snark', 'tomswift', 'codingtheory', 'gltrv', 'khan', 'oz', 'sbrun', 'starwars' , 'Brown' ) ;
} elsif ($f==13) {
# omit khan:
    @filelist = ('alice', 'emma' , 'hawking' , 'molerat', 'phrases', 'snark', 'tomswift', 'codingtheory', 'gltrv', 'oz', 'sbrun', 'starwars' , 'Brown' ) ; 
} elsif ($f==6) {
# just the simple english ones, and only big files.
    @filelist = ('alice', 'hawking' , 'oz', 'sbrun', 'starwars' , 'Brown' ) ; 
# corpus.p k=1 o=corpus6.txt
} elsif ($f ==2) {
# Minimal set, maybe?
    @filelist = ( 'hawking' , 'Brown' ) ; 
# corpus.p k=1 f=2 o=corpus2.txt
# wc  corpus2.txt1
#  300K.
} elsif   ( $f==4 ) {
    @filelist = ( 'alice' ,   'hawking' , 'Brown' ,    'phrases' ) ;
} else { # $f==5
    @filelist = ( 'alice' ,   'hawking' , 'Brown' ,    'phrases' , 'additions' ) ;
}

$L = 0 ; # number of lines
open (DI,"< $dict ") ;
$output="$o$k" ;
if ($additions eq "" ) {
    open ( O , ">$output" ) ;
} else {
    print STDERR "executing     cat $dir$additions > $output\n" ;
    system("cat $dir$additions > $output" ) ;
    open ( O , ">>$output" ) ;
}
$numwords=0;$notfound=0;
DILOOP:
while(<DI>) {
    # remove initial whitespace
    s/^\s+// ;
    next DILOOP if (/^\#/) ; # skip comments
#    next DILOOP if (/\'/) ; # skip apostrophe-ful lines
    
    ($word,$rest) = split ;
    last DILOOP     if ( $word eq "" );
    next DILOOP if ($worddone{$word}) ;
    $worddone{$word} = 1 ;
    print "$word\n" if ($verbose>-1) ;
    $numwords ++ ;
    $thiswordmatches = 0 ;
    $words= $word ;
    $words =~ s#\'#\\\'#g ; # $words is the actual word-search-pattern
    foreach $f ( @filelist ) {
	print $f,"\n" if ($verbose>1) ;
	$GC = "$glimpsecommand1 -L $Lstring $glimpsecommand2 -F $f $words" ;
	print $GC."\n" if ($verbose>0) ;
	open ( G, "$GC |" ) ;
	# CLEAR OUT thelist
	 undef(@thelines) ;	 undef(@thelist) ; $alreadyin = 0 ; 
	while (<G>) {
	    ($n,$rest ) = split ( ":" ) ;
	    # remove the line number
	    s/$n\:// ;
	    if ( !$hits{$f,$n} ) {
		push ( @thelist , $n ) ;
		push ( @thelines , $_ ) ; 
	    } else {
		$hits{$f,$n} += 1 ;
		$alreadyin ++ ; # the word has already appeared in the corpus
	    }
	}
	print "matches for $word in $f: lines ".join(":",@thelist)."\n" if ($verbose>0) ;
#	print "number of matches is ".$#thelist+1."\n" ; 
	while (($alreadyin < $k) && ($#thelist+1>0) ) {
	    # @thelist contains a list of lines that have the word. We should add some of them
	    $q = int ( rand($#thelist+1) ) ;
	    $n = splice ( @thelist , $q , 1  ) ;
	    $_ = splice ( @thelines , $q , 1  ) ;
	    $hits{$f,$n} = 1 ;
	    $alreadyin ++ ;

	    # actions to take before writing...
	    s/^\s+// ; # remove initial spaces
	    # Remove the newline from 2 of every 3 lines
	    # almost always remove the new line
	    $L ++ ;
	    if ( ($L%$period) ) {
		s/\n/ /; 
	    } else {
		# print extra \n occasionally  ($paraperiod should be multiple of period)
		if ( !($L%$paraperiod) ) { print O "\n" ;}
	    }
	    s/[ \t][ \t]+/ /g  ; # reduce multiple spaces to 1
	    if ($v) { print O " [$word,$f:$alreadyin/$k] "; }
	    print O  ;
	}
	$thiswordmatches += $alreadyin ;
    }
    if ($thiswordmatches==0 ) {
	$notfound ++ ;
	print STDERR " $word NOT FOUND\n" ; 
	push ( @nflist , $word ) ; 
    }
}
if($notfound) {
    print  "DONE\n not found:\n" ;
    print  join("\n",@nflist);
}
print O "\n\n" ;
close(O);
print STDERR "Wrote $L lines to $output with parameter $k\n" ;
print STDERR "Of $numwords requested, $notfound were not found\n" ;





    

