#!/usr/bin/perl
use warnings;
use strict;
use Getopt::Long qw(:config bundling no_ignore_case);
use File::Find::Rule;
use File::Basename;
use Cwd ('abs_path');
use File::Which qw(which);
use DBI;
use open IO => ':encoding(utf8)';			# This is needed at least without more (en|de)codes. Otherwise 
use Encode qw(encode decode);				# utf8 plain texts don't end up in the db
# Compile-time configuration.
# which() is context sensitive: called in list context (as it would be inside this hash constructor) it returns every
# match, or an empty list when the program isn't found. Either case shifts all the following key/value pairs. Forcing
# scalar context always yields exactly one value: the first match, or undef when pdftotext isn't installed.
use constant { TRUE          => 1,
	       FALSE         => 0,
	       DB_NAME       => 'pdf.sqlite',
	       DB_DIR	     => '.searchpdf',
	       PDFTOTEXT     => scalar(which('pdftotext')),
	       DB_CACHE      => 50_000,			# Adjust depending on the amount of pdfs
	       TABLE         => 'pdfs',
	       ID            => 'id',
	       FILE          => 'file',
	       LAST_MODIFIED => 'last_modified',
	       PLAIN_TEXT    => 'plain_text',
	       SIZE          => 'size',
	       UPDATED       => 'updated' };

# Table and column names as plain scalars, so that they can be interpolated into the qq{} SQL strings.
my ($table, $id, $file, $last_modified, $plain_text, $size, $updated)
	= (TABLE, ID, FILE, LAST_MODIFIED, PLAIN_TEXT, SIZE, UPDATED);

# Path of the converter and the value given to "PRAGMA cache_size".
my ($pdftotext, $cache_size) = (PDFTOTEXT, DB_CACHE);

# Option defaults.
my @directories = ();						# Filled by -f or -c.
my ($matches, $recursive, $query);				# Undefined: no match limit, no depth limit, no query.
my $database    = default_database();
my ($index, $verbose) = (FALSE, FALSE);
my $enc         = 'UTF-8';					# Encoding

# Getopt::Long handles the options in the order they are given on the command line. Both -f and -c replace
# @directories, so the last one of them wins.
my $options_ok = GetOptions(
	'help|h|?'      => \&help,
	# Arguments which aren't directories or aren't readable are dropped with a warning.
	'folder|f=s'    => \&get_dirs,
	'config|c=s'    => \&config_file,
	'matches|m=i'   => \$matches,
	'recursive|r=i' => \$recursive,
	'database|d=s'  => \$database,
	'query|q=s'     => \$query,
	'index|i'       => sub
	{
		$index = TRUE;
		# Indexing is pointless without the converter.
		die("pdftotext not found, stopped") if(!defined PDFTOTEXT);
	},
	'verbose|v'     => sub { $verbose = TRUE },
	'encoding|e=s'  => \$enc,
);
exit(1) unless($options_ok);


### Bad arguments START ################################################################################################

# -i and -q exclude each other, and one of them is mandatory.
die("Contradicting arguments -i and -q passed, stopped") if($index && defined $query);
die("Either -i or -q must be passed, stopped") unless($index || defined $query);

### Bad arguments END ##################################################################################################

# Create the directory of the database when it is missing.
db_dir_exists($database);

# Index mode: bring the database in sync with the pdfs found in @directories.
# TODO Every statement is committed on its own (connect_db doesn't turn AutoCommit off); transactions would be faster.
if($index)
{
	# The directory has to be writable too, because SQLite might create temp files next to the database.
	die("Database directory isn't writable, stopped") unless(-w (fileparse($database))[1]);
	my $db_handle = connect_db();
	$db_handle->do("PRAGMA cache_size = $cache_size");
	die("Database isn't writable, stopped") unless(-w $database);

	# Create the table when the database doesn't have one yet. Only the name of the table is checked, so a table
	# with the same name but other columns leads to errors later on.
	unless( $db_handle->selectrow_array( qq{SELECT name
						FROM sqlite_master
						WHERE type='table' AND name='$table'} ) )
	{
		create_table($db_handle);
	}

	# Find the pdfs and collect the absolute path, modification time and size of each.
	my @pdfs     = enum_pdfs($recursive, @directories);
	my @pdf_info = gather_info_pdfs(@pdfs);

	# Pass 1: every pdf found on the file system is inserted, refreshed or just flagged as seen ($updated = 1).
	# The statements are prepared once instead of once per pdf. The bare block makes the statement handles go out
	# of scope before the database handle is disconnected.
	{
		my $lookup  = $db_handle->prepare( qq{SELECT $file, $last_modified
						      FROM $table
						      WHERE $file=?} );
		my $insert  = $db_handle->prepare( qq{INSERT INTO $table
						      ($file, $last_modified, $plain_text, $size)
						      VALUES (?, ?, ?, ?)} );
		my $refresh = $db_handle->prepare( qq{UPDATE $table
						      SET $last_modified=?, $plain_text=?, $size=?, $updated=?
						      WHERE $file=?} );
		my $mark    = $db_handle->prepare( qq{UPDATE $table
						      SET $updated=?
						      WHERE $file=?} );

		for my $pdf(@pdf_info)
		{
			# Empty list, if the pdf isn't in the db yet.
			my @row = $db_handle->selectrow_array( $lookup, {}, $pdf->{FILE} );

			# New pdf. The $updated column defaults to 1 for inserted rows.
			if(!@row)
			{
				my $text = get_text($pdf->{FILE}, $enc);
				$insert->execute( $pdf->{FILE}, $pdf->{LAST_MODIFIED}, $text, $pdf->{SIZE} );
			}
			# Known pdf, which has been modified since it was stored.
			elsif($pdf->{LAST_MODIFIED} > $row[1])
			{
				my $text = get_text($pdf->{FILE}, $enc);
				$refresh->execute( $pdf->{LAST_MODIFIED}, $text, $pdf->{SIZE}, 1, $pdf->{FILE} );
			}
			# Known and unchanged pdf, only flag it as seen.
			else
			{
				$mark->execute( 1, $pdf->{FILE} );
			}
		}
	}

	# Pass 2: rows which weren't seen in pass 1 ($updated = 0), e.g. because their directory wasn't given this time.
	{
		my $unseen = $db_handle->prepare( qq{SELECT $file, $last_modified
						     FROM $table
						     WHERE $updated=?} );
		my $delete = $db_handle->prepare( qq{DELETE FROM $table
						     WHERE $file=?} );
		my $resync = $db_handle->prepare( qq{UPDATE $table
						     SET $last_modified=?, $size=?, $plain_text=?
						     WHERE $file=?} );

		for my $row ( @{$db_handle->selectall_arrayref( $unseen, undef, (0) )} )
		{
			my ($stored_file, $stored_mtime) = @$row;

			# The pdf has vanished from the file system, so its row is removed.
			if(!-e $stored_file)
			{
				$delete->execute($stored_file);
				next;
			}

			# The pdf still exists. Extract the text again, if the file is newer than the row.
			my ($current) = gather_info_pdfs($stored_file);
			if($stored_mtime < $current->{LAST_MODIFIED})
			{
				my $text = get_text($stored_file, $enc);
				$resync->execute( $current->{LAST_MODIFIED}, $current->{SIZE}, $text, $stored_file );
			}
		}
	}

	# Clear the "seen" flags for the next run.
	$db_handle->do( qq{UPDATE $table
			   SET $updated=?
			   WHERE $updated=?}, undef, (0, 1) );

	$db_handle->disconnect();
}

# Query mode.
# TODO Use fts: create a virtual table and copy the original to it, something like
# create virtual table virtual_pdfs using fts3(file, size, plain_text);
# insert into virtual_pdfs(file, size, plain_text) select file, size, plain_text from pdfs;
elsif(defined $query)
{
	# If db doesn't exist, there's no reason to continue
	die("Database not found, stopped") unless(-f $database);
	my $db_handle = connect_db();

	# The query comes from the command line as bytes. Decode it and detect, whether it is a regexp or a LIKE pattern.
	my ($const_query, $query_type) = construct_query(decode($enc, $query));

	# Query the db and print the formatted answer in the chosen encoding.
	my $answers = query($const_query, $db_handle, $query_type, $verbose);

	print encode($enc, $answers);

	$db_handle->disconnect();
}

exit(0);


### SUBS ############################################################################################################

# Opens the SQLite database file named by $database and returns the database handle. Errors are raised as exceptions
# (RaiseError) and strings are exchanged as Perl character strings (sqlite_unicode).
sub connect_db
{
	my %attributes = ( RaiseError => 1, sqlite_unicode => 1 );

	my $handle = DBI->connect("dbi:SQLite:dbname=$database", '', '', \%attributes);
	die($DBI::errstr) unless($handle);

	return $handle;
}

### Subs for indexing START #########################################################################################
#
# Creates the pdf table. Takes the database handle as the parameter. The file name is unique, the plain text is
# mandatory and the "updated" flag is 1 for freshly inserted rows.
sub create_table
{
	my ($dbh) = @_;

	my $ddl = qq{CREATE TABLE $table
		           ( $id INTEGER PRIMARY KEY ASC,
		             $file TEXT UNIQUE,
		             $last_modified INT,
		             $plain_text TEXT NOT NULL,
		             $size INT,
		             $updated INT DEFAULT 1 )};

	return $dbh->do($ddl);
}

# Lists the readable regular files whose name ends with ".pdf" (in any letter case) in the given directories.
# The first parameter is the maximum depth to descend (undefined means no limit), the rest are the directories.
# Returns the list of the files found.
sub enum_pdfs
{
	my ($recursion, @directories) = @_;

	my $rule = File::Find::Rule->new->file->readable->name(qr/\.pdf$/i)->maxdepth($recursion);

	my @found = $rule->in(@directories);
	return @found;
}

# Collects the facts stored about each pdf. Takes a list of file names and returns a list of hash references, one for
# each file, with the keys FILE (absolute path, decoded with $enc), LAST_MODIFIED (modification time) and SIZE (size in
# bytes). The keys are literal strings, not the constants with the same names.
sub gather_info_pdfs
{
	my @records;

	for my $pdf(@_)
	{
		# Decoding the path here has been enough to get correct file names into the database.
		my $path = decode($enc, abs_path($pdf));

		# One stat call provides both the modification time and the size.
		my ($mtime, $bytes) = (stat($pdf))[9, 7];

		push(@records, { FILE => $path, LAST_MODIFIED => $mtime, SIZE => $bytes });
	}

	return @records;
}

# Extracts the plain text of a pdf with pdftotext. Takes the file name of the pdf and the output encoding given to
# pdftotext as parameters. Returns the text, or an empty string if nothing could be extracted.
#
# The program is started with the list form of a piped open, so no shell is involved and characters like quotes,
# dollar signs or backticks in the file name can neither break the command nor run another one.
# NOTE The list form of a piped open is implemented on Windows only since Perl 5.22.
sub get_text
{
	my ($pdf, $enc) = @_;

	my @command = ($pdftotext, '-q', '-layout', '-enc', $enc, $pdf, '-');

	my $pipe;
	unless(open($pipe, '-|', @command))
	{
		warn("Couldn't run $pdftotext for $pdf: $!");
		return '';
	}

	# Read the whole output at once.
	my $text = do { local $/; <$pipe> };

	# The exit status of pdftotext is ignored on purpose: a pdf without extractable text is still indexed.
	close($pipe);

	return defined $text ? $text : '';
}

### Subs for indexing END ###########################################################################################

### Subs for querying START #########################################################################################
#
# Execute query. Takes already constructed query, the database handle, the type of the query ('LIKE' or 'REGEXP') and
# verbosity as parameters. At most $matches rows are fetched (all of them, if $matches is undefined).
# Returns a formatted result string: one line per matching pdf, either "file\n" or, when verbose, "file: excerpt\n".
# The excerpt is matched case-sensitively although LIKE is case-insensitive, so it is empty for some matches.
sub query
{
	my ($query, $dbh, $type, $verbose) = @_;

	# The plain text is fetched only when an excerpt of it has to be printed.
	my $columns = $verbose ? "$file, $plain_text" : $file;
	my $answers = $dbh->selectall_arrayref( qq{SELECT $columns
						   FROM $table
						   WHERE $plain_text $type ?}, { MaxRows => $matches }, $query );

	my $result = '';
	my $excerpt_re;
	for my $answer (@$answers)
	{
		my ($found_file, $text) = @$answer;

		if(!$verbose)
		{
			$result .= $found_file . "\n";
			next;
		}

		# Compiled only when there is at least one answer to cut an excerpt from.
		unless(defined $excerpt_re)
		{
			my $pattern = _excerpt_pattern($query, $type);
			# Up to 15 characters before and 30 characters after the match on the same line.
			$excerpt_re = qr/(.{0,15}$pattern.{0,30})/;
		}

		my $excerpt = $text =~ $excerpt_re ? $1 : '';
		$result .= $found_file . ': ' . $excerpt . "\n";
	}

	return $result;
}

# Translates the query into the Perl pattern that locates the match inside a plain text. Takes the query and its type
# as parameters and returns the pattern as a string.
sub _excerpt_pattern
{
	my ($query, $type) = @_;

	# A REGEXP query is a Perl regular expression already. It is grouped, so that an alternation in it doesn't
	# swallow the context around it.
	return "(?:$query)" if($type eq 'REGEXP');

	# LIKE query. Leading and trailing '%' only mean "anywhere in the text" and are left out, otherwise the excerpt
	# would not start near the match.
	(my $like = $query) =~ s/^%+|%+$//g;

	# '%' is any run of characters and '_' exactly one character. Everything else is literal text, which is escaped
	# so that characters like '+' or '(' can't be taken as regexp syntax.
	my $pattern = join('', map { $_ eq '%' ? '.*?'
				   : $_ eq '_' ? '.'
				   :             quotemeta($_) } split(/([%_])/, $like));

	# The wildcards of LIKE match newlines too.
	return "(?s:$pattern)";
}

# Detects the type of the query. A query surrounded by slashes with something between them is a regexp: the slashes
# are removed and the type is 'REGEXP'. Anything else is returned unchanged with the type 'LIKE'.
# Returns the list (query string, SQL operator).
sub construct_query
{
	my ($raw) = @_;

	# In list context the match returns the captured text, or nothing if the query isn't of the form /.../
	my ($inner) = $raw =~ m{^/(.+)/$};

	return defined $inner ? ($inner, 'REGEXP') : ($raw, 'LIKE');
}

### Subs for querying END ###########################################################################################

# Returns the default location of the database: DB_DIR/DB_NAME below the directory named by the environment variable
# HOME (*nix) or, if that isn't set, HOMEPATH (Windows). If neither is set, the path is relative.
sub default_database
{
	my $relative = join('/', DB_DIR, DB_NAME);

	for my $variable (qw(HOME HOMEPATH))
	{
		return $ENV{$variable}.'/'.$relative if(defined $ENV{$variable});
	}

	# No home directory known.
	return $relative;
}

# Makes sure the directory of the database exists. Takes the path of the database file as the parameter and creates
# the directory part of it, if it is missing. Only the last level is created. Dies, if the directory can't be created.
sub db_dir_exists
{
	my ($db) = @_;

	my $db_dir = dirname($db);
	return if(-d $db_dir);

	mkdir($db_dir) or die("Couldn't create database directory $db_dir: $!");
}

### Argument handling subs START ####################################################################################
#
# Prints the usage to standard output and exits with status 0. Used as the handler of the help option, the arguments
# given by Getopt::Long are ignored.
sub help
{
	# Name of the script without the directory.
	my $executable = (fileparse($0))[0];

	print <<HELP;
$executable version 0.3.0-04-06-2011, by Jukka Laitinen.
Create SQLite databases out of PDF documents and search from them.
Usage:  $executable [OPTION]... -q QUERY
        $executable [OPTION]... -i

	-c, 	--config=FILE		Define directories in a FILE
	-d, 	--database=FILE		Database named FILE
	-e,	--encoding=STRING	Set encoding. Default is UTF-8
	-f,     --folder=DIR1,DIR2,...	Directories to search from. Empty by default
	-h, -?, --help			Prints this help
	-i,	--index			Index and update pdf documents
	-m, 	--matches=N		Search is discontinued after n matches found. Default is find all matches
	-q, 	--query=STRING		Query the database
	-r, 	--recursive=N		Recurse to directories N depth. Default is to recurse infinitely
	-v,	--verbose		Print a short part of the matching plain text
HELP

	exit(0);
}

# Reads the directories to index from a config file, one directory per line. Used as the handler of the config option:
# the second argument is the name of the file. Lines starting with '#' are comments, blank lines are skipped. A line
# which isn't a readable directory is ignored with a warning. Replaces the contents of @directories, which is declared
# in main! Dies, if the file can't be opened or closed.
sub config_file
{
	# Get the config file
	my $config = $_[1];

	@directories = ();

	# Three-argument open, so that characters like '>' or '|' in the file name can't change the open mode.
	open(my $fh, '<', $config) || die("Couldn't open the config file: $config: $!");
	while(my $line = <$fh>)
	{
		chomp($line);				# The newline isn't part of the directory name

		next if($line =~ /^#/);			# Comment
		next if($line =~ /^\s*$/);		# Blank line

		if(-d $line && -r _)			# Is directory and readable
		{
			push(@directories, $line);
		}
		else
		{
			warn("The dir argument $line isn't a directory or readable by effective user");
		}
	}

	close($fh) || die("Couldn't close the config file: $config");
}

# Handler of the folder option: the second argument is a comma separated list of directories. Entries which aren't
# directories or aren't readable by the effective user are left out with a warning. The accepted directories replace
# the contents of @directories, which is declared in main!
sub get_dirs
{
	my (undef, $arg) = @_;

	my @accepted;
	for my $candidate (split(/,/, $arg))
	{
		if(-d $candidate && -r _)
		{
			push(@accepted, $candidate);
			next;
		}

		warn("The dir argument $candidate isn't a directory or readable by effective user");
	}

	@directories = @accepted;
}

### Argument handling subs END ######################################################################################


=head1 NAME

pdfsearch.pl - Index and search PDF documents.

=head1 SYNOPSIS

$ ./pdfsearch.pl -f/ -i

$ ./pdfsearch.pl -d somepdfs.db -f dir1,dir2,dir3 -r 1 -i

$ ./pdfsearch.pl -d somepdfs.db -vm5 -q '%search term%'

=head1 DESCRIPTION

Creates a database out of PDF documents, first finding them from specified directories, extracting text
out of them and saving the plain text to a database. After the initial index creation the database can be queried.

=head2 OPTIONS

=over 4

=item -c,     --config=I<FILE>	

Define directories in a file, one directory per line. Line starting with '#' is considered as a comment.

=item -d,     --database=I<FILE>

If the database exists already, it is never truncated.
Default for *NIX systems is $HOME/.searchpdf/pdf.sqlite and %HOMEPATH%\.searchpdf\pdf.sqlite for Windows.

=item -e,     --encoding=I<STRING>

Depending on the encoding of the PDF texts, you might want to change this. Default is UTF-8. Affects both the -i and -q
switches.

=item -f,     --folder=I<DIR>,I<DIR>,...

Directories to search from, separated by commas. Therefore, directories can't include commas.

=item -h, -?, --help

=item -i,     --index

Creates and updates the database.
If you want to only update the database, not to find new PDF files, you can pass only the -i switch:

$ ./pdfsearch.pl -i

=item -m,     --matches=I<N>

Querying the database is discontinued after N matches found.

=item -q,     --query=I<'STRING'>

There are two ways to query the database. If the argument is surrounded by slashes, a Perl regular expression can be
used. For example, a case-insensitive search:

$ ./pdfsearch.pl -q'/(?i:CaSe DoEs NoT MaTtEr)/'

The slashes only tell this program to use SQLite's REGEXP function - which is implemented in DBD::SQLite - and they
are removed from the query.

If slashes are not used, SQL's LIKE operator is used. The usual SQL wildcards can be used, '%' for zero or more
characters and '_' for exactly one character. This type of query is case-insensitive.
Plain texts are inserted into the database as independent strings, so it's always good to surround the query with '%'
wildcards.
If the query contains, e.g., whitespace, it has to be quoted or escaped properly. Use double quotes on Windows.

=item -r,     --recursive=I<N>

Recurse to directories only N levels deep. Default is infinitely.

=item -v,     --verbose

Prints a short part of the matching text.
Note, that while the LIKE operator is case-insensitive, regular expressions in this case aren't - it would make
matching very slow - so part of the matching text isn't always printed.

=back

=head1 FILES

I<~/.searchpdf/pdf.sqlite>

=head1 AUTHOR

Jukka Laitinen

=cut

