diff --git a/modify_pdf.pl b/modify_pdf.pl new file mode 100644 index 00000000000..978fb2b0449 --- /dev/null +++ b/modify_pdf.pl @@ -0,0 +1,156 @@ +#!/usr/bin/perl + +use 5.010; +use Compress::Raw::Zlib; +use Digest::MD5 qw(md5 md5_hex md5_base64); +use File::Copy; +use LWP::Simple; +use MIME::Base64 qw(encode_base64url decode_base64url); +use POSIX qw(strftime); +use Socket; +use bytes; +use strict; +use warnings; + +my ($infile, $outfile) = @ARGV; +die "Usage: $0 INFILE OUTFILE\n" if not $outfile; + +open my $in, '<', $infile or die; +binmode $in; + +my $cont = ''; + +while (1) +{ + my $success = read $in, $cont, 100, length ($cont); + die $! if not defined $success; + last if not $success; +} +close $in; + +open my $out, '>', $outfile or die; +binmode $out; +print $out $cont; +close $out; + +# Decode the Tj components of the streams +sub hex_val +{ + my $v = $_ [0]; + + if ($v =~ m/\d/) { return $v; } + if ($v eq "a") { return 10; } + if ($v eq "b") { return 11; } + if ($v eq "c") { return 12; } + if ($v eq "d") { return 13; } + if ($v eq "e") { return 14; } + if ($v eq "f") { return 15; } + return $v; +} + +my $overall_text; +my $tj; +my $this_tj; +sub print_str +{ + my $str = $_ [0]; + my $orig_str = $str; + my $this_s; + + while ($str =~ s/(.)(.)//) + { + my $a = hex_val ($1) * 16; + my $b = hex_val ($2); + #print ($a+$b, " "); + $this_s .= chr ($a+$b); + } + + if ($this_s =~ m/ +[A-Z]/) + { + $this_s .= " $this_tj$orig_str<<\n"; + $this_tj = " >> TJ="; + } + else + { + $this_tj .= "\n str:($this_s) $orig_str :"; + } + return $this_s; +} + +sub get_pdf_text +{ + my $text = $_ [0]; + # hh hh hh hh hh << hex based on two + while ($text =~ s/^(.*)\n//im) + { + my $line = $1; + if ($line =~ m/<([0-9a-f]+)>.*?Tj/) + { + my $str = $1; + my $this_s = print_str ($str); + $overall_text .= $this_s; + } + } +} +# Done - Decode the Tj components of the streams + +# Write out the chunks of stream?? +my $keep = 1; +my $cont2 = $cont; +my $keep_cont2 = $cont2; + +my $stream_r = qr/^.*?FlateDecode.*?[^d]stream/s; +my $endstream_after_r = qr/endstream.*/s; +my $endstream_before_r = qr/^.*?endstream/s; +my $newline = qr/\r\n/s; +my $o; + +# MAIN +while ($keep) +{ + my $cont_two = 1; + while ($cont2 =~ m/[^d]stream/im) + { + $cont2 =~ s/$stream_r//; + $keep_cont2 = $cont2; + print ("\n >>> " . length ($cont2)); + $cont2 =~ s/$endstream_after_r//; + print ("\n 2>>> " . length ($cont2)); + $cont2 =~ s/$newline//img; + $keep_cont2 =~ s/$endstream_before_r//; + print ("\n 3>>> " . length ($keep_cont2)); + + # Compressed + my $outfile2 = "perl_stream.$keep.zip"; + open my $out, '>', $outfile2 or die; + binmode $out; + print $out $cont2; + close $out; + + # Decompressed + my $d = new Compress::Raw::Zlib::Inflate(); + my $output = $d->inflate ($cont2, $o); + print " xxxx after inflate..>> $output \n"; + my $outfile2 = $outfile . ".$keep.txt"; + open my $out, '>', $outfile2 or die; + binmode $out; + print $out $o; + close $out; + + get_pdf_text ($o); + + $keep++; + $cont2 = $keep_cont2; + } + + print ("$overall_text\n"); + $keep = 0; +} + +$cont =~ s/\W/_/img; +$cont =~ s/___*/_/img; +#print $cont; + +say length($cont); +print ("$infile >> ", -s $infile, "\n"); +print ("$outfile >> ", -s $outfile, "\n");