starting implementation of index pdf contents using pdf2text class #245
This commit is contained in:
parent
954c5b3e2f
commit
2e2b6f93f3
|
@ -228,4 +228,30 @@ class Media {
|
|||
throw new \Exception("fatal error");
|
||||
}
|
||||
|
||||
public index_pdf_content($file, $item_id) {
|
||||
|
||||
if ( ! \file_exists($file) ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Allow plugins to implement other approach to index pdf contents
|
||||
$alternate = apply_filters('tainacan-index-pdf', null, $file, $item_id);
|
||||
if ( ! \is_null($alternate) ) {
|
||||
return $alternate;
|
||||
}
|
||||
|
||||
require_once( TAINACAN_CLASSES_DIR . '/lib/class-pdf2text.php' );
|
||||
|
||||
$PDF2Text = new PDF2Text();
|
||||
$PDF2Text->setFilename($file);
|
||||
|
||||
try {
|
||||
$PDF2Text->decodePDF();
|
||||
update_post_meta( $item_id, '_pdf_index', $PDF2Text->output() );
|
||||
} catch($e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,377 @@
|
|||
<?php
|
||||
/*
|
||||
|
||||
Source: https://gist.github.com/getive/78b62d42cebd79b5dfd6
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
This code is an improved version of what can be found at:
|
||||
http://www.webcheatsheet.com/php/reading_clean_text_from_pdf.php
|
||||
|
||||
AUTHOR:
|
||||
- Webcheatsheet.com (Original code)
|
||||
- Joeri Stegeman (joeri210 [at] yahoo [dot] com) (Class conversion and fixes/adjustments)
|
||||
|
||||
DESCRIPTION:
|
||||
This is a class to convert PDF files into ASCII text or so called PDF text extraction.
|
||||
It will ignore anything that is not addressed as text within the PDF and any layout.
|
||||
Currently supported filters are: ASCIIHexDecode, ASCII85Decode, FlateDecode
|
||||
|
||||
PURPOSE(S):
|
||||
Most likely for people that want their PDF to be searchable.
|
||||
|
||||
SYNTAX:
|
||||
include('class.pdf2text.php');
|
||||
$a = new PDF2Text();
|
||||
$a->setFilename('test.pdf');
|
||||
$a->decodePDF();
|
||||
echo $a->output();
|
||||
|
||||
ALTERNATIVES:
|
||||
Other excellent options to search within a PDF:
|
||||
- Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution
|
||||
- pdflib TET (http://www.pdflib.com/products/tet/)
|
||||
- Online converter: http://snowtide.com/PDFTextStream
|
||||
*/
|
||||
|
||||
|
||||
class PDF2Text {
|
||||
// Some settings
|
||||
var $multibyte = 2; // Use setUnicode(TRUE|FALSE)
|
||||
var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None)
|
||||
|
||||
// Variables
|
||||
var $filename = '';
|
||||
var $decodedtext = '';
|
||||
|
||||
function setFilename($filename) {
|
||||
// Reset
|
||||
$this->decodedtext = '';
|
||||
$this->filename = $filename;
|
||||
}
|
||||
|
||||
function output($echo = false) {
|
||||
if($echo) echo $this->decodedtext;
|
||||
else return $this->decodedtext;
|
||||
}
|
||||
|
||||
function setUnicode($input) {
|
||||
// 4 for unicode. But 2 should work in most cases just fine
|
||||
if($input == true) $this->multibyte = 4;
|
||||
else $this->multibyte = 2;
|
||||
}
|
||||
|
||||
function decodePDF() {
|
||||
// Read the data from pdf file
|
||||
$infile = @file_get_contents($this->filename, FILE_BINARY);
|
||||
if (empty($infile))
|
||||
return "";
|
||||
|
||||
// Get all text data.
|
||||
$transformations = array();
|
||||
$texts = array();
|
||||
|
||||
// Get the list of all objects.
|
||||
preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $objects);
|
||||
$objects = @$objects[1];
|
||||
|
||||
// Select objects with streams.
|
||||
for ($i = 0; $i < count($objects); $i++) {
|
||||
$currentObject = $objects[$i];
|
||||
|
||||
// Check if an object includes data stream.
|
||||
if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) {
|
||||
$stream = ltrim($stream[1]);
|
||||
|
||||
// Check object parameters and look for text data.
|
||||
$options = $this->getObjectOptions($currentObject);
|
||||
|
||||
if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])))
|
||||
continue;
|
||||
|
||||
// Hack, length doesnt always seem to be correct
|
||||
unset($options["Length"]);
|
||||
|
||||
// So, we have text data. Decode it.
|
||||
$data = $this->getDecodedStream($stream, $options);
|
||||
|
||||
if (strlen($data)) {
|
||||
if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) {
|
||||
$textContainers = @$textContainers[1];
|
||||
$this->getDirtyTexts($texts, $textContainers);
|
||||
} else
|
||||
$this->getCharTransformations($transformations, $data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Analyze text blocks taking into account character transformations and return results.
|
||||
$this->decodedtext = $this->getTextUsingTransformations($texts, $transformations);
|
||||
}
|
||||
|
||||
|
||||
function decodeAsciiHex($input) {
|
||||
$output = "";
|
||||
|
||||
$isOdd = true;
|
||||
$isComment = false;
|
||||
|
||||
for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) {
|
||||
$c = $input[$i];
|
||||
|
||||
if($isComment) {
|
||||
if ($c == '\r' || $c == '\n')
|
||||
$isComment = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
switch($c) {
|
||||
case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break;
|
||||
case '%':
|
||||
$isComment = true;
|
||||
break;
|
||||
|
||||
default:
|
||||
$code = hexdec($c);
|
||||
if($code === 0 && $c != '0')
|
||||
return "";
|
||||
|
||||
if($isOdd)
|
||||
$codeHigh = $code;
|
||||
else
|
||||
$output .= chr($codeHigh * 16 + $code);
|
||||
|
||||
$isOdd = !$isOdd;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if($input[$i] != '>')
|
||||
return "";
|
||||
|
||||
if($isOdd)
|
||||
$output .= chr($codeHigh * 16);
|
||||
|
||||
return $output;
|
||||
}
|
||||
|
||||
function decodeAscii85($input) {
|
||||
$output = "";
|
||||
|
||||
$isComment = false;
|
||||
$ords = array();
|
||||
|
||||
for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) {
|
||||
$c = $input[$i];
|
||||
|
||||
if($isComment) {
|
||||
if ($c == '\r' || $c == '\n')
|
||||
$isComment = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ')
|
||||
continue;
|
||||
if ($c == '%') {
|
||||
$isComment = true;
|
||||
continue;
|
||||
}
|
||||
if ($c == 'z' && $state === 0) {
|
||||
$output .= str_repeat(chr(0), 4);
|
||||
continue;
|
||||
}
|
||||
if ($c < '!' || $c > 'u')
|
||||
return "";
|
||||
|
||||
$code = ord($input[$i]) & 0xff;
|
||||
$ords[$state++] = $code - ord('!');
|
||||
|
||||
if ($state == 5) {
|
||||
$state = 0;
|
||||
for ($sum = 0, $j = 0; $j < 5; $j++)
|
||||
$sum = $sum * 85 + $ords[$j];
|
||||
for ($j = 3; $j >= 0; $j--)
|
||||
$output .= chr($sum >> ($j * 8));
|
||||
}
|
||||
}
|
||||
if ($state === 1)
|
||||
return "";
|
||||
elseif ($state > 1) {
|
||||
for ($i = 0, $sum = 0; $i < $state; $i++)
|
||||
$sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i);
|
||||
for ($i = 0; $i < $state - 1; $i++)
|
||||
$ouput .= chr($sum >> ((3 - $i) * 8));
|
||||
}
|
||||
|
||||
return $output;
|
||||
}
|
||||
|
||||
function decodeFlate($input) {
|
||||
return gzuncompress($input);
|
||||
}
|
||||
|
||||
function getObjectOptions($object) {
|
||||
$options = array();
|
||||
|
||||
if (preg_match("#<<(.*)>>#ismU", $object, $options)) {
|
||||
$options = explode("/", $options[1]);
|
||||
@array_shift($options);
|
||||
|
||||
$o = array();
|
||||
for ($j = 0; $j < @count($options); $j++) {
|
||||
$options[$j] = preg_replace("#\s+#", " ", trim($options[$j]));
|
||||
if (strpos($options[$j], " ") !== false) {
|
||||
$parts = explode(" ", $options[$j]);
|
||||
$o[$parts[0]] = $parts[1];
|
||||
} else
|
||||
$o[$options[$j]] = true;
|
||||
}
|
||||
$options = $o;
|
||||
unset($o);
|
||||
}
|
||||
|
||||
return $options;
|
||||
}
|
||||
|
||||
function getDecodedStream($stream, $options) {
|
||||
$data = "";
|
||||
if (empty($options["Filter"]))
|
||||
$data = $stream;
|
||||
else {
|
||||
$length = !empty($options["Length"]) ? $options["Length"] : strlen($stream);
|
||||
$_stream = substr($stream, 0, $length);
|
||||
|
||||
foreach ($options as $key => $value) {
|
||||
if ($key == "ASCIIHexDecode")
|
||||
$_stream = $this->decodeAsciiHex($_stream);
|
||||
if ($key == "ASCII85Decode")
|
||||
$_stream = $this->decodeAscii85($_stream);
|
||||
if ($key == "FlateDecode")
|
||||
$_stream = $this->decodeFlate($_stream);
|
||||
if ($key == "Crypt") { // TO DO
|
||||
}
|
||||
}
|
||||
$data = $_stream;
|
||||
}
|
||||
return $data;
|
||||
}
|
||||
function getDirtyTexts(&$texts, $textContainers) {
|
||||
|
||||
for ($j = 0; $j < count($textContainers); $j++) {
|
||||
if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts))
|
||||
$texts = array_merge($texts, @$parts[1]);
|
||||
elseif(preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
|
||||
$texts = array_merge($texts, @$parts[1]);
|
||||
elseif(preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
|
||||
$texts = array_merge($texts, @$parts[1]);
|
||||
}
|
||||
}
|
||||
function getCharTransformations(&$transformations, $stream) {
|
||||
preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
|
||||
preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);
|
||||
|
||||
for ($j = 0; $j < count($chars); $j++) {
|
||||
$count = $chars[$j][1];
|
||||
$current = explode("\n", trim($chars[$j][2]));
|
||||
for ($k = 0; $k < $count && $k < count($current); $k++) {
|
||||
if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map))
|
||||
$transformations[str_pad($map[1], 4, "0")] = $map[2];
|
||||
}
|
||||
}
|
||||
for ($j = 0; $j < count($ranges); $j++) {
|
||||
$count = $ranges[$j][1];
|
||||
$current = explode("\n", trim($ranges[$j][2]));
|
||||
for ($k = 0; $k < $count && $k < count($current); $k++) {
|
||||
if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) {
|
||||
$from = hexdec($map[1]);
|
||||
$to = hexdec($map[2]);
|
||||
$_from = hexdec($map[3]);
|
||||
|
||||
for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
|
||||
$transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
|
||||
} elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) {
|
||||
$from = hexdec($map[1]);
|
||||
$to = hexdec($map[2]);
|
||||
$parts = preg_split("#\s+#", trim($map[3]));
|
||||
|
||||
for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++)
|
||||
$transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
function getTextUsingTransformations($texts, $transformations) {
|
||||
$document = "";
|
||||
for ($i = 0; $i < count($texts); $i++) {
|
||||
$isHex = false;
|
||||
$isPlain = false;
|
||||
|
||||
$hex = "";
|
||||
$plain = "";
|
||||
for ($j = 0; $j < strlen($texts[$i]); $j++) {
|
||||
$c = $texts[$i][$j];
|
||||
switch($c) {
|
||||
case "<":
|
||||
$hex = "";
|
||||
$isHex = true;
|
||||
break;
|
||||
case ">":
|
||||
$hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO)
|
||||
for ($k = 0; $k < count($hexs); $k++) {
|
||||
$chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero
|
||||
if (isset($transformations[$chex]))
|
||||
$chex = $transformations[$chex];
|
||||
$document .= html_entity_decode("&#x".$chex.";");
|
||||
}
|
||||
$isHex = false;
|
||||
break;
|
||||
case "(":
|
||||
$plain = "";
|
||||
$isPlain = true;
|
||||
break;
|
||||
case ")":
|
||||
$document .= $plain;
|
||||
$isPlain = false;
|
||||
break;
|
||||
case "\\":
|
||||
$c2 = $texts[$i][$j + 1];
|
||||
if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2;
|
||||
elseif ($c2 == "n") $plain .= '\n';
|
||||
elseif ($c2 == "r") $plain .= '\r';
|
||||
elseif ($c2 == "t") $plain .= '\t';
|
||||
elseif ($c2 == "b") $plain .= '\b';
|
||||
elseif ($c2 == "f") $plain .= '\f';
|
||||
elseif ($c2 >= '0' && $c2 <= '9') {
|
||||
$oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3));
|
||||
$j += strlen($oct) - 1;
|
||||
$plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes);
|
||||
}
|
||||
$j++;
|
||||
break;
|
||||
|
||||
default:
|
||||
if ($isHex)
|
||||
$hex .= $c;
|
||||
if ($isPlain)
|
||||
$plain .= $c;
|
||||
break;
|
||||
}
|
||||
}
|
||||
$document .= "\n";
|
||||
}
|
||||
|
||||
return $document;
|
||||
}
|
||||
}
|
||||
?>
|
Loading…
Reference in New Issue