Add in modify_pdf.pl

Way to decode all the bits and pieces in a PDF.  FlateDecode streams are decoded and then the   <414243>Tj style bits of text are decoded.  There are also (abc)Tj bits but I don't handle these.   (41h=65='A')
This commit is contained in:
spjspj 2025-02-23 22:46:39 +11:00 committed by GitHub
parent ed85cf7111
commit 41f8a47529
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

156
modify_pdf.pl Normal file
View file

@ -0,0 +1,156 @@
#!/usr/bin/perl
use 5.010;
use Compress::Raw::Zlib;
use Digest::MD5 qw(md5 md5_hex md5_base64);
use File::Copy;
use LWP::Simple;
use MIME::Base64 qw(encode_base64url decode_base64url);
use POSIX qw(strftime);
use Socket;
use bytes;
use strict;
use warnings;
my ($infile, $outfile) = @ARGV;
die "Usage: $0 INFILE OUTFILE\n" if not $outfile;
open my $in, '<', $infile or die;
binmode $in;
my $cont = '';
while (1)
{
my $success = read $in, $cont, 100, length ($cont);
die $! if not defined $success;
last if not $success;
}
close $in;
open my $out, '>', $outfile or die;
binmode $out;
print $out $cont;
close $out;
# Decode the Tj components of the streams
sub hex_val
{
my $v = $_ [0];
if ($v =~ m/\d/) { return $v; }
if ($v eq "a") { return 10; }
if ($v eq "b") { return 11; }
if ($v eq "c") { return 12; }
if ($v eq "d") { return 13; }
if ($v eq "e") { return 14; }
if ($v eq "f") { return 15; }
return $v;
}
my $overall_text;
my $tj;
my $this_tj;
sub print_str
{
my $str = $_ [0];
my $orig_str = $str;
my $this_s;
while ($str =~ s/(.)(.)//)
{
my $a = hex_val ($1) * 16;
my $b = hex_val ($2);
#print ($a+$b, " ");
$this_s .= chr ($a+$b);
}
if ($this_s =~ m/ +[A-Z]/)
{
$this_s .= " $this_tj$orig_str<<\n";
$this_tj = " >> TJ=";
}
else
{
$this_tj .= "\n str:($this_s) $orig_str :";
}
return $this_s;
}
sub get_pdf_text
{
my $text = $_ [0];
# hh hh hh hh hh << hex based on two
while ($text =~ s/^(.*)\n//im)
{
my $line = $1;
if ($line =~ m/<([0-9a-f]+)>.*?Tj/)
{
my $str = $1;
my $this_s = print_str ($str);
$overall_text .= $this_s;
}
}
}
# Done - Decode the Tj components of the streams
# Write out the chunks of stream??
my $keep = 1;
my $cont2 = $cont;
my $keep_cont2 = $cont2;
my $stream_r = qr/^.*?FlateDecode.*?[^d]stream/s;
my $endstream_after_r = qr/endstream.*/s;
my $endstream_before_r = qr/^.*?endstream/s;
my $newline = qr/\r\n/s;
my $o;
# MAIN
while ($keep)
{
my $cont_two = 1;
while ($cont2 =~ m/[^d]stream/im)
{
$cont2 =~ s/$stream_r//;
$keep_cont2 = $cont2;
print ("\n >>> " . length ($cont2));
$cont2 =~ s/$endstream_after_r//;
print ("\n 2>>> " . length ($cont2));
$cont2 =~ s/$newline//img;
$keep_cont2 =~ s/$endstream_before_r//;
print ("\n 3>>> " . length ($keep_cont2));
# Compressed
my $outfile2 = "perl_stream.$keep.zip";
open my $out, '>', $outfile2 or die;
binmode $out;
print $out $cont2;
close $out;
# Decompressed
my $d = new Compress::Raw::Zlib::Inflate();
my $output = $d->inflate ($cont2, $o);
print " xxxx after inflate..>> $output \n";
my $outfile2 = $outfile . ".$keep.txt";
open my $out, '>', $outfile2 or die;
binmode $out;
print $out $o;
close $out;
get_pdf_text ($o);
$keep++;
$cont2 = $keep_cont2;
}
print ("$overall_text\n");
$keep = 0;
}
$cont =~ s/\W/_/img;
$cont =~ s/___*/_/img;
#print $cont;
say length($cont);
print ("$infile >> ", -s $infile, "\n");
print ("$outfile >> ", -s $outfile, "\n");