diff --git a/composer.json b/composer.json index ae246dec5..d03574e01 100644 --- a/composer.json +++ b/composer.json @@ -3,7 +3,8 @@ "description": "Transforme seu site wordpress em um repositório digital.", "type": "wordpress-plugin", "require": { - "respect/validation": "^1.1" + "respect/validation": "^1.1", + "smalot/pdfparser": "*" }, "require-dev": { "squizlabs/php_codesniffer": "^2.2 || ^3.0.2", diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php index 730ca9e7f..0e8ad8c3d 100644 --- a/src/classes/class-tainacan-media.php +++ b/src/classes/class-tainacan-media.php @@ -229,7 +229,7 @@ class Media { } public function index_pdf_content($file, $item_id) { - + $content_index_meta = '_document_content_index'; if (defined('TAINACAN_CONTENT_PDF_INDEX_METADATA')) { $content_index_meta = TAINACAN_CONTENT_PDF_INDEX_METADATA; @@ -239,7 +239,7 @@ class Media { $meta_id = update_post_meta( $item_id, $content_index_meta, null ); return true; } - + if ( ! \file_exists($file) ) { return false; } @@ -254,12 +254,9 @@ class Media { return $alternate; } - $PDF2Text = new \PDF2Text(); - $PDF2Text->setUnicode(true); - $PDF2Text->setFilename($file); try { - $PDF2Text->decodePDF(); - $content = preg_replace('~[[:cntrl:]]~', '', $PDF2Text->output()); + $parser = new \Smalot\PdfParser\Parser(); + $content = $parser->parseFile($file)->getText(); $wp_charset = get_bloginfo('charset'); $content_charset = mb_detect_encoding($content); diff --git a/src/classes/libs/class-pdf2text.php b/src/classes/libs/class-pdf2text.php deleted file mode 100644 index c46b51691..000000000 --- a/src/classes/libs/class-pdf2text.php +++ /dev/null @@ -1,377 +0,0 @@ -setFilename('test.pdf'); -$a->decodePDF(); -echo $a->output(); - -ALTERNATIVES: -Other excellent options to search within a PDF: -- Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution -- pdflib TET (http://www.pdflib.com/products/tet/) -- Online converter: http://snowtide.com/PDFTextStream -*/ - - -class PDF2Text { - // Some settings - var $multibyte = 2; // Use setUnicode(TRUE|FALSE) - var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None) - - // Variables - var $filename = ''; - var $decodedtext = ''; - - function setFilename($filename) { - // Reset - $this->decodedtext = ''; - $this->filename = $filename; - } - - function output($echo = false) { - if($echo) echo $this->decodedtext; - else return $this->decodedtext; - } - - function setUnicode($input) { - // 4 for unicode. But 2 should work in most cases just fine - if($input == true) $this->multibyte = 4; - else $this->multibyte = 2; - } - - function decodePDF() { - // Read the data from pdf file - $infile = @file_get_contents($this->filename, FILE_BINARY); - if (empty($infile)) - return ""; - - // Get all text data. - $transformations = array(); - $texts = array(); - - // Get the list of all objects. - preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $objects); - $objects = @$objects[1]; - - // Select objects with streams. - for ($i = 0; $i < count($objects); $i++) { - $currentObject = $objects[$i]; - - // Check if an object includes data stream. - if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) { - $stream = ltrim($stream[1]); - - // Check object parameters and look for text data. - $options = $this->getObjectOptions($currentObject); - - if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) - continue; - - // Hack, length doesnt always seem to be correct - unset($options["Length"]); - - // So, we have text data. Decode it. - $data = $this->getDecodedStream($stream, $options); - - if (strlen($data)) { - if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) { - $textContainers = @$textContainers[1]; - $this->getDirtyTexts($texts, $textContainers); - } else - $this->getCharTransformations($transformations, $data); - } - } - } - - // Analyze text blocks taking into account character transformations and return results. - $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations); - } - - - function decodeAsciiHex($input) { - $output = ""; - - $isOdd = true; - $isComment = false; - - for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) { - $c = $input[$i]; - - if($isComment) { - if ($c == '\r' || $c == '\n') - $isComment = false; - continue; - } - - switch($c) { - case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break; - case '%': - $isComment = true; - break; - - default: - $code = hexdec($c); - if($code === 0 && $c != '0') - return ""; - - if($isOdd) - $codeHigh = $code; - else - $output .= chr($codeHigh * 16 + $code); - - $isOdd = !$isOdd; - break; - } - } - - if($input[$i] != '>') - return ""; - - if($isOdd) - $output .= chr($codeHigh * 16); - - return $output; - } - - function decodeAscii85($input) { - $output = ""; - - $isComment = false; - $ords = array(); - - for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) { - $c = $input[$i]; - - if($isComment) { - if ($c == '\r' || $c == '\n') - $isComment = false; - continue; - } - - if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ') - continue; - if ($c == '%') { - $isComment = true; - continue; - } - if ($c == 'z' && $state === 0) { - $output .= str_repeat(chr(0), 4); - continue; - } - if ($c < '!' || $c > 'u') - return ""; - - $code = ord($input[$i]) & 0xff; - $ords[$state++] = $code - ord('!'); - - if ($state == 5) { - $state = 0; - for ($sum = 0, $j = 0; $j < 5; $j++) - $sum = $sum * 85 + $ords[$j]; - for ($j = 3; $j >= 0; $j--) - $output .= chr($sum >> ($j * 8)); - } - } - if ($state === 1) - return ""; - elseif ($state > 1) { - for ($i = 0, $sum = 0; $i < $state; $i++) - $sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i); - for ($i = 0; $i < $state - 1; $i++) - $ouput .= chr($sum >> ((3 - $i) * 8)); - } - - return $output; - } - - function decodeFlate($input) { - return gzuncompress($input); - } - - function getObjectOptions($object) { - $options = array(); - - if (preg_match("#<<(.*)>>#ismU", $object, $options)) { - $options = explode("/", $options[1]); - @array_shift($options); - - $o = array(); - for ($j = 0; $j < @count($options); $j++) { - $options[$j] = preg_replace("#\s+#", " ", trim($options[$j])); - if (strpos($options[$j], " ") !== false) { - $parts = explode(" ", $options[$j]); - $o[$parts[0]] = $parts[1]; - } else - $o[$options[$j]] = true; - } - $options = $o; - unset($o); - } - - return $options; - } - - function getDecodedStream($stream, $options) { - $data = ""; - if (empty($options["Filter"])) - $data = $stream; - else { - $length = !empty($options["Length"]) ? $options["Length"] : strlen($stream); - $_stream = substr($stream, 0, $length); - - foreach ($options as $key => $value) { - if ($key == "ASCIIHexDecode") - $_stream = $this->decodeAsciiHex($_stream); - if ($key == "ASCII85Decode") - $_stream = $this->decodeAscii85($_stream); - if ($key == "FlateDecode") - $_stream = $this->decodeFlate($_stream); - if ($key == "Crypt") { // TO DO - } - } - $data = $_stream; - } - return $data; - } - function getDirtyTexts(&$texts, $textContainers) { - - for ($j = 0; $j < count($textContainers); $j++) { - if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts)) - $texts = array_merge($texts, @$parts[1]); - elseif(preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) - $texts = array_merge($texts, @$parts[1]); - elseif(preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) - $texts = array_merge($texts, @$parts[1]); - } - } - function getCharTransformations(&$transformations, $stream) { - preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER); - preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER); - - for ($j = 0; $j < count($chars); $j++) { - $count = $chars[$j][1]; - $current = explode("\n", trim($chars[$j][2])); - for ($k = 0; $k < $count && $k < count($current); $k++) { - if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map)) - $transformations[str_pad($map[1], 4, "0")] = $map[2]; - } - } - for ($j = 0; $j < count($ranges); $j++) { - $count = $ranges[$j][1]; - $current = explode("\n", trim($ranges[$j][2])); - for ($k = 0; $k < $count && $k < count($current); $k++) { - if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) { - $from = hexdec($map[1]); - $to = hexdec($map[2]); - $_from = hexdec($map[3]); - - for ($m = $from, $n = 0; $m <= $to; $m++, $n++) - $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n); - } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) { - $from = hexdec($map[1]); - $to = hexdec($map[2]); - $parts = preg_split("#\s+#", trim($map[3])); - - for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) - $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n])); - } - } - } - } - function getTextUsingTransformations($texts, $transformations) { - $document = ""; - for ($i = 0; $i < count($texts); $i++) { - $isHex = false; - $isPlain = false; - - $hex = ""; - $plain = ""; - for ($j = 0; $j < strlen($texts[$i]); $j++) { - $c = $texts[$i][$j]; - switch($c) { - case "<": - $hex = ""; - $isHex = true; - break; - case ">": - $hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO) - for ($k = 0; $k < count($hexs); $k++) { - $chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero - if (isset($transformations[$chex])) - $chex = $transformations[$chex]; - $document .= html_entity_decode("&#x".$chex.";"); - } - $isHex = false; - break; - case "(": - $plain = ""; - $isPlain = true; - break; - case ")": - $document .= $plain; - $isPlain = false; - break; - case "\\": - $c2 = $texts[$i][$j + 1]; - if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2; - elseif ($c2 == "n") $plain .= '\n'; - elseif ($c2 == "r") $plain .= '\r'; - elseif ($c2 == "t") $plain .= '\t'; - elseif ($c2 == "b") $plain .= '\b'; - elseif ($c2 == "f") $plain .= '\f'; - elseif ($c2 >= '0' && $c2 <= '9') { - $oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3)); - $j += strlen($oct) - 1; - $plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes); - } - $j++; - break; - - default: - if ($isHex) - $hex .= $c; - if ($isPlain) - $plain .= $c; - break; - } - } - $document .= "\n"; - } - - return $document; - } -} -?> \ No newline at end of file diff --git a/src/classes/tainacan-creator.php b/src/classes/tainacan-creator.php index 5c513b00f..964f38ee8 100644 --- a/src/classes/tainacan-creator.php +++ b/src/classes/tainacan-creator.php @@ -34,7 +34,6 @@ const DIRS = [ require_once('libs/wp-async-request.php'); require_once('libs/wp-background-process.php'); -require_once('libs/class-pdf2text.php'); require_once('class-tainacan-background-process.php'); require_once('tainacan-utils.php'); require_once(TAINACAN_IMPORTER_DIR . 'class-tainacan-bg-importer.php');