forked from External/mage
Add in modify_pdf.pl
Way to decode all the bits and pieces in a PDF. FlateDecode streams are decoded and then the <414243>Tj style bits of text are decoded. There are also (abc)Tj bits but I don't handle these. (41h=65='A')
This commit is contained in:
parent
ed85cf7111
commit
41f8a47529
1 changed files with 156 additions and 0 deletions
156
modify_pdf.pl
Normal file
156
modify_pdf.pl
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
use 5.010;
|
||||
use Compress::Raw::Zlib;
|
||||
use Digest::MD5 qw(md5 md5_hex md5_base64);
|
||||
use File::Copy;
|
||||
use LWP::Simple;
|
||||
use MIME::Base64 qw(encode_base64url decode_base64url);
|
||||
use POSIX qw(strftime);
|
||||
use Socket;
|
||||
use bytes;
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
my ($infile, $outfile) = @ARGV;
|
||||
die "Usage: $0 INFILE OUTFILE\n" if not $outfile;
|
||||
|
||||
open my $in, '<', $infile or die;
|
||||
binmode $in;
|
||||
|
||||
my $cont = '';
|
||||
|
||||
while (1)
|
||||
{
|
||||
my $success = read $in, $cont, 100, length ($cont);
|
||||
die $! if not defined $success;
|
||||
last if not $success;
|
||||
}
|
||||
close $in;
|
||||
|
||||
open my $out, '>', $outfile or die;
|
||||
binmode $out;
|
||||
print $out $cont;
|
||||
close $out;
|
||||
|
||||
# Decode the Tj components of the streams
|
||||
sub hex_val
|
||||
{
|
||||
my $v = $_ [0];
|
||||
|
||||
if ($v =~ m/\d/) { return $v; }
|
||||
if ($v eq "a") { return 10; }
|
||||
if ($v eq "b") { return 11; }
|
||||
if ($v eq "c") { return 12; }
|
||||
if ($v eq "d") { return 13; }
|
||||
if ($v eq "e") { return 14; }
|
||||
if ($v eq "f") { return 15; }
|
||||
return $v;
|
||||
}
|
||||
|
||||
my $overall_text;
|
||||
my $tj;
|
||||
my $this_tj;
|
||||
sub print_str
|
||||
{
|
||||
my $str = $_ [0];
|
||||
my $orig_str = $str;
|
||||
my $this_s;
|
||||
|
||||
while ($str =~ s/(.)(.)//)
|
||||
{
|
||||
my $a = hex_val ($1) * 16;
|
||||
my $b = hex_val ($2);
|
||||
#print ($a+$b, " ");
|
||||
$this_s .= chr ($a+$b);
|
||||
}
|
||||
|
||||
if ($this_s =~ m/ +[A-Z]/)
|
||||
{
|
||||
$this_s .= " $this_tj$orig_str<<\n";
|
||||
$this_tj = " >> TJ=";
|
||||
}
|
||||
else
|
||||
{
|
||||
$this_tj .= "\n str:($this_s) $orig_str :";
|
||||
}
|
||||
return $this_s;
|
||||
}
|
||||
|
||||
sub get_pdf_text
|
||||
{
|
||||
my $text = $_ [0];
|
||||
# hh hh hh hh hh << hex based on two
|
||||
while ($text =~ s/^(.*)\n//im)
|
||||
{
|
||||
my $line = $1;
|
||||
if ($line =~ m/<([0-9a-f]+)>.*?Tj/)
|
||||
{
|
||||
my $str = $1;
|
||||
my $this_s = print_str ($str);
|
||||
$overall_text .= $this_s;
|
||||
}
|
||||
}
|
||||
}
|
||||
# Done - Decode the Tj components of the streams
|
||||
|
||||
# Write out the chunks of stream??
|
||||
my $keep = 1;
|
||||
my $cont2 = $cont;
|
||||
my $keep_cont2 = $cont2;
|
||||
|
||||
my $stream_r = qr/^.*?FlateDecode.*?[^d]stream/s;
|
||||
my $endstream_after_r = qr/endstream.*/s;
|
||||
my $endstream_before_r = qr/^.*?endstream/s;
|
||||
my $newline = qr/\r\n/s;
|
||||
my $o;
|
||||
|
||||
# MAIN
|
||||
while ($keep)
|
||||
{
|
||||
my $cont_two = 1;
|
||||
while ($cont2 =~ m/[^d]stream/im)
|
||||
{
|
||||
$cont2 =~ s/$stream_r//;
|
||||
$keep_cont2 = $cont2;
|
||||
print ("\n >>> " . length ($cont2));
|
||||
$cont2 =~ s/$endstream_after_r//;
|
||||
print ("\n 2>>> " . length ($cont2));
|
||||
$cont2 =~ s/$newline//img;
|
||||
$keep_cont2 =~ s/$endstream_before_r//;
|
||||
print ("\n 3>>> " . length ($keep_cont2));
|
||||
|
||||
# Compressed
|
||||
my $outfile2 = "perl_stream.$keep.zip";
|
||||
open my $out, '>', $outfile2 or die;
|
||||
binmode $out;
|
||||
print $out $cont2;
|
||||
close $out;
|
||||
|
||||
# Decompressed
|
||||
my $d = new Compress::Raw::Zlib::Inflate();
|
||||
my $output = $d->inflate ($cont2, $o);
|
||||
print " xxxx after inflate..>> $output \n";
|
||||
my $outfile2 = $outfile . ".$keep.txt";
|
||||
open my $out, '>', $outfile2 or die;
|
||||
binmode $out;
|
||||
print $out $o;
|
||||
close $out;
|
||||
|
||||
get_pdf_text ($o);
|
||||
|
||||
$keep++;
|
||||
$cont2 = $keep_cont2;
|
||||
}
|
||||
|
||||
print ("$overall_text\n");
|
||||
$keep = 0;
|
||||
}
|
||||
|
||||
$cont =~ s/\W/_/img;
|
||||
$cont =~ s/___*/_/img;
|
||||
#print $cont;
|
||||
|
||||
say length($cont);
|
||||
print ("$infile >> ", -s $infile, "\n");
|
||||
print ("$outfile >> ", -s $outfile, "\n");
|
||||
Loading…
Add table
Add a link
Reference in a new issue