#!/usr/bin/perl -w

# Script to parse a one-off index format:
# the name of the index file identifies the folder with the files to match (see %boxes)
# title line: name of the pdf document, without the .pdf,
#   optionally followed by tab-separated type and year
# [tab]	0-many description lines
# [empty line is a separator]
# attention: the following characters break this script: ) ( , /

use strict;
use JSON;

my %index;	# box folder name -> pdf document title -> description text
# Index file name -> box folder name. Index files that are not listed here
# (including the commented-out entries below) are skipped by the parser.
my %boxes = (
#	'1.txt' => '1 - Gordan Izpiti',
#	'2.txt' => '2 - razno scherber',
#	'3.txt' => '3 - Milena Kosec',
#	'4.txt' => '4 - SOFTY software Jure Longyka',
#	'6.txt' => '6 - Gortan 1',
#	'7.txt' => '7 - Gortan 2',
#	'8.txt' => '8 - Gortan, raziskave, poročila',
	'9.txt' => '9 - IDC'
);

# Parse the inputs: every *.txt index file that is enabled in %boxes.
# Each title line is written as one row to index.csv
# (box, title, type, year, pdf path, first page image path) and the
# description lines that follow it are collected in %index.
open(my $csv, '>', 'index.csv') or die "cannot write index.csv: $!";

# glob instead of `ls`: no shell involved and no trailing newline to strip
foreach my $index_file (glob('*.txt')) {
	my $box = $boxes{$index_file};
	next if !$box;	# index file not enabled in %boxes

	my $title = "";
	my $description = "";

	# read the file directly instead of through `cat`, so odd characters
	# in the file name cannot be interpreted by the shell
	my $in;
	if (!open($in, '<', $index_file)) {
		warn "cannot read $index_file: $!";
		next;
	}

	while (my $line = <$in>) {
		# chomp, not chop: the last line may come without a newline and
		# chop would then eat a real character
		chomp $line;

		# empty lines are separators between blocks; compare as a string
		# so that a line containing just "0" is not mistaken for one
		next if $line eq "";

		if ($line =~ /^\t(.+?)$/) {	# description lines start with tab
			if ($title eq "") {
				# without a title there is no document to attach it to
				warn "description line before any title in $index_file, skipped";
				next;
			}
			$description .= $1."\n";
			$index{$box}{$title} = $description;

		} else {	# next title, rotate
			my ($new_title, $type, $year) = split /\t/, $line;
			if (!defined $new_title or $new_title eq "") {
				# e.g. a line holding nothing but tabs
				warn "line without a title in $index_file, skipped";
				next;
			}
			$title = $new_title;
			$description = "";
			$index{$box}{$title} = $description;

			# type and year are optional columns
			$type = "" if !defined $type;
			$year = "" if !defined $year;

			# check once per record, so the last record is covered even
			# when the file does not end with an empty line
			my $filename = "$box/$title.pdf";
			if (!-f $filename) { warn "no file $filename"; }

			print {$csv} join("\t", $box, $title, $type, $year, "$box/$title.pdf", "$box/$title/00000001.jpg")."\n"
				or die "cannot write to index.csv: $!";
		}
	}
	close $in;
}
close($csv) or die "cannot finish writing index.csv: $!";

# Generate outputs if they don't exist yet. For every parsed document:
#   - split the pdf into one-page pdfs inside a folder named after it,
#   - write its description into "<box>/<title>.description",
#   - convert every page pdf into a jpg next to it.
# External commands are run in list form, so no shell is involved and
# quotes or spaces in a title cannot break the command line.
foreach my $box (sort keys %index) {
	foreach my $title (sort keys %{$index{$box}}) {
		# an empty title would address the box folder itself ("$box/")
		# and convert every pdf stored in it
		next if $title eq "";

		my $pdf = "$box/$title.pdf";
		my $folder = "$box/$title";

# create folder for every pdf and split into pages
		if (!-d $folder) {
			if (!-f $pdf) {
				# do not leave an empty folder behind: its existence would
				# stop a later run from splitting the pdf once it is there
				warn "no file $pdf, not splitting";
			} elsif (!mkdir $folder) {
				warn "cannot create folder $folder: $!";
			} else {
				#system ("pdftk '$pdf' output '$folder/'");
				# the %%%%%%%%.pdf pattern is passed to cpdf literally
				my @split = ('./cpdf', $pdf, '-split', '-chunk', '1', '-o', "$folder/%%%%%%%%.pdf");
				system(@split) == 0 or warn "cpdf failed for $pdf (status $?)";
			}
		}

# print out description files
		my $description_file = "$box/$title.description";
		if (!-f $description_file) {
			if (open(my $out, '>', $description_file)) {
				print {$out} $index{$box}{$title};
				close($out) or warn "cannot finish writing $description_file: $!";
			} else {
				warn "cannot write $description_file: $!";
			}
		}

# convert pages into jpgs
		if (!-f "$folder/00000001.jpg" and -d $folder) {
			my $dir;
			if (!opendir($dir, $folder)) {
				warn "cannot read folder $folder: $!";
				next;
			}
			# like `ls *.pdf`: visible files ending in .pdf, sorted
			my @pages = sort grep { !/^\./ and /\.pdf$/ } readdir($dir);
			closedir($dir);

			foreach my $page (@pages) {
				my $doc = "$folder/$page";
				# replace the extension, so page 1 becomes 00000001.jpg, the
				# name tested above and referenced from index.csv
				# (the image used to be written as 00000001.pdf.jpg)
				(my $jpg = $doc) =~ s/\.pdf$/.jpg/;
				next if -f $jpg;

				my @convert = ('convert', '-colorspace', 'RGB', '-interlace', 'none',
					'-density', '300x300', '-quality', '80', $doc, $jpg);
				warn join(' ', @convert);	# progress log
				system(@convert) == 0 or warn "convert failed for $doc (status $?)";
			}
		}
	}
}

warn "now run ocr.pl";
warn "and then manage.py import_docs revije/index.csv";

# The two warnings above remind the operator of the remaining manual steps:
# next and final step: run ocr.pl
# then just manage.py import_docs revije/index.csv
