# /lib/MyApp/Plugin/OCR.pm package MyApp::Plugin::OCR; use Mojo::Base 'Mojolicious::Plugin'; use File::Temp qw(tempfile); use Mojo::Util qw(trim); use strict; use warnings; use utf8; # Optical Character Recognition and Receipt Parsing Service Plugin. # Features: # - Image pre-processing via ImageMagick (Grayscale, Deskew, Threshold) # - Text extraction via Tesseract OCR # - Heuristic parsing for Store Name, Date, and Total Amount # # Integration points: # - Registers global helper '$c->ocr_process($blob)' in Mojolicious. # - Relies on external 'convert' and 'tesseract' binaries. # - Logs process errors to the application logger. sub register { my ($self, $app, $config) = @_; $app->helper(ocr_process => sub { my ($c, $image_data) = @_; # 1. Create temporary files for processing my ($fh_in, $fname_in) = tempfile(UNLINK => 1); my ($fh_out, $fname_out) = tempfile(SUFFIX => '.jpg', UNLINK => 1); binmode($fh_in); print $fh_in $image_data; close($fh_in); # 2. Pre-process and OCR with primary method (Sharpened) my $ocr_base = $fname_out; $ocr_base =~ s/\.jpg$//; my $data = $self->_execute_ocr($c, $fname_in, $fname_out, $ocr_base, ["-colorspace", "gray", "-unsharp", "0x5+1.0+0.05", "-deskew", "40%", "-threshold", "50%"]); # 3. Fallback: If critical data is missing, try secondary method (Resized) if (!$data->{receipt_date} || !$data->{total_amount}) { my $fallback_data = $self->_execute_ocr($c, $fname_in, $fname_out, $ocr_base, ["-resize", "200%", "-colorspace", "gray", "-deskew", "40%", "-threshold", "50%"]); # Merge results: only fill in missing fields $data->{store_name} ||= $fallback_data->{store_name}; $data->{receipt_date} ||= $fallback_data->{receipt_date}; $data->{total_amount} ||= $fallback_data->{total_amount}; $data->{raw_text} .= " --- FALLBACK OCR OUTPUT --- " . $fallback_data->{raw_text}; } return $data; }); } sub _execute_ocr { my ($self, $c, $fname_in, $fname_out, $ocr_base, $flags) = @_; # Run ImageMagick if (system("convert", $fname_in, @$flags, "jpg:$fname_out") != 0) { $c->app->log->error("OCR: ImageMagick 'convert' failed: $!"); } # Run Tesseract if (system("tesseract", $fname_out, $ocr_base, "--psm", "6", "--oem", "1", "quiet") != 0) { $c->app->log->error("OCR: Tesseract execution failed: $!"); } my $txt_file = $ocr_base . ".txt"; my $raw_text = ""; if (-f $txt_file) { open my $fh, "<:utf8", $txt_file; $raw_text = do { local $/; <$fh> }; close $fh; unlink($txt_file); } my $data = $self->parse_text($raw_text); $data->{raw_text} = $raw_text; return $data; } sub parse_text { my ($self, $text) = @_; my @lines = grep { trim($_) ne '' } split(/ /, $text); my $data = { store_name => undef, receipt_date => undef, total_amount => undef }; return $data unless @lines; # --- Store Name Heuristics --- my %store_map = ( 'ALDI' => 'ALDI', 'COLES' => 'Coles', 'WOOLWORTHS' => 'Woolworths', 'worths' => 'Woolworths', 'W00LW0RTHS' => 'Woolworths', 'IGA' => 'IGA', 'FOODWORKS' => 'Foodworks', 'KMART' => 'Kmart', 'TARGET' => 'Target', 'BIG W' => 'Big W', 'BUNNINGS' => 'Bunnings', '7-ELEVEN' => '7-Eleven', '7 ELEVEN' => '7-Eleven', 'AMPOL' => 'Ampol', 'CALTEX' => 'Ampol', 'JB HI-FI' => 'JB Hi-Fi', 'CHEMIST WAREHOUSE'=> 'Chemist Warehouse', 'DAN MURPHY' => "Dan Murphy's", ); STORE_SEARCH: for my $line (@lines) { my $clean_line = uc(trim($line)); foreach my $key (sort { length($b) <=> length($a) } keys %store_map) { if ($clean_line =~ /\b$key\b/i || $clean_line =~ /^$key/i) { $data->{store_name} = $store_map{$key}; last STORE_SEARCH; } } } # --- Total Amount Heuristics --- my @amounts; while ($text =~ /(?:TOTAL|AMOUNT|BAL|DUE|AUD|Card Sales|EFT|SUBTOTAL|PAYABLE|PAID)\s*[:\$]*\s*([0-9]{1,4}[\.,][0-9]{2})/gi) { push @amounts, $1; } if (@amounts) { my @sorted_amounts = sort { $b <=> $a } map { s/,/./; $_ } @amounts; $data->{total_amount} = $sorted_amounts[0]; } # --- Date Heuristics --- my @dates; while ($text =~ m|\b(\d{1,2})[/\.-](\d{1,2})[/\.-](\d{2,4})\b|g) { my ($d, $m, $y) = ($1, $2, $3); next if $m > 12 || $d > 31; $y = "20$y" if length($y) == 2; push @dates, sprintf("%04d-%02d-%02d", $y, $m, $d); } if (@dates) { require Time::Piece; my $now = Time::Piece->new; my $today = $now->strftime('%Y-%m-%d'); my $oldest = sprintf('%04d-%02d-%02d', $now->year - 10, $now->mon, $now->mday); my $is_plausible = sub { my $str = shift; return 0 if $str gt $today || $str lt $oldest; eval { Time::Piece->strptime($str, '%Y-%m-%d') }; return $@ ? 0 : 1; }; ($data->{receipt_date}) = grep { $is_plausible->($_) } @dates; } return $data; } 1;