add the lib "PDF Parser" and change to use it

2019-09-18 17:48:46 -03:00 · 2019-09-18 17:48:46 -03:00 · fc7bb5250d
parent dbec21d502
commit fc7bb5250d
4 changed files with 6 additions and 386 deletions
--- a/composer.json
+++ b/composer.json
@ -3,7 +3,8 @@
    "description": "Transforme seu site wordpress em um repositório digital.",
    "type": "wordpress-plugin",
    "require": {
-        "respect/validation": "^1.1"
+        "respect/validation": "^1.1",
+        "smalot/pdfparser": "*"
    },
    "require-dev": {
        "squizlabs/php_codesniffer": "^2.2 || ^3.0.2",
--- a/src/classes/class-tainacan-media.php
+++ b/src/classes/class-tainacan-media.php
@ -254,12 +254,9 @@ class Media {
 			return $alternate;
 		}

-		$PDF2Text = new \PDF2Text();
-		$PDF2Text->setUnicode(true);
-		$PDF2Text->setFilename($file);
 		try {
-			$PDF2Text->decodePDF();
-			$content = preg_replace('~[[:cntrl:]]~', '', $PDF2Text->output());
+			$parser = new \Smalot\PdfParser\Parser();
+			$content    = $parser->parseFile($file)->getText();

 			$wp_charset = get_bloginfo('charset');
 			$content_charset = mb_detect_encoding($content);
--- a/src/classes/libs/class-pdf2text.php
+++ b/src/classes/libs/class-pdf2text.php
@ -1,377 +0,0 @@
-<?php
-/*
-
-Source: https://gist.github.com/getive/78b62d42cebd79b5dfd6
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-This code is an improved version of what can be found at:
-http://www.webcheatsheet.com/php/reading_clean_text_from_pdf.php
-
-AUTHOR:
- Webcheatsheet.com (Original code)
- Joeri Stegeman (joeri210 [at] yahoo [dot] com) (Class conversion and fixes/adjustments)
-
-DESCRIPTION:
-This is a class to convert PDF files into ASCII text or so called PDF text extraction.
-It will ignore anything that is not addressed as text within the PDF and any layout.
-Currently supported filters are: ASCIIHexDecode, ASCII85Decode, FlateDecode
-
-PURPOSE(S):
-Most likely for people that want their PDF to be searchable.
-
-SYNTAX:
-include('class.pdf2text.php');
-$a = new PDF2Text();
-$a->setFilename('test.pdf');
-$a->decodePDF();
-echo $a->output(); 
-
-ALTERNATIVES:
-Other excellent options to search within a PDF:
- Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution
- pdflib TET (http://www.pdflib.com/products/tet/)
- Online converter: http://snowtide.com/PDFTextStream
-*/
-
-
-class PDF2Text {
-    // Some settings
-    var $multibyte = 2; // Use setUnicode(TRUE|FALSE)
-    var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None)
-    
-    // Variables
-    var $filename = '';
-    var $decodedtext = '';
-    
-    function setFilename($filename) { 
-        // Reset
-        $this->decodedtext = '';
-        $this->filename = $filename;
-    }
-
-    function output($echo = false) { 
-        if($echo) echo $this->decodedtext;
-        else return $this->decodedtext;
-    }
-
-    function setUnicode($input) { 
-        // 4 for unicode. But 2 should work in most cases just fine
-        if($input == true) $this->multibyte = 4;
-        else $this->multibyte = 2;
-    }
-
-    function decodePDF() { 
-        // Read the data from pdf file
-        $infile = @file_get_contents($this->filename, FILE_BINARY); 
-        if (empty($infile)) 
-            return ""; 
-    
-        // Get all text data.
-        $transformations = array(); 
-        $texts = array(); 
-    
-        // Get the list of all objects.
-        preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $objects); 
-        $objects = @$objects[1]; 
-    
-        // Select objects with streams.
-        for ($i = 0; $i < count($objects); $i++) { 
-            $currentObject = $objects[$i]; 
-    
-            // Check if an object includes data stream.
-            if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) { 
-                $stream = ltrim($stream[1]); 
-    
-                // Check object parameters and look for text data. 
-                $options = $this->getObjectOptions($currentObject); 
-    
-                if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) 
-                    continue; 
-    
-                // Hack, length doesnt always seem to be correct
-                unset($options["Length"]);
-    
-                // So, we have text data. Decode it.
-                $data = $this->getDecodedStream($stream, $options);  
-    
-                if (strlen($data)) { 
-                    if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) {
-                        $textContainers = @$textContainers[1]; 
-                        $this->getDirtyTexts($texts, $textContainers); 
-                    } else 
-                        $this->getCharTransformations($transformations, $data); 
-                } 
-            } 
-        } 
-    
-        // Analyze text blocks taking into account character transformations and return results. 
-        $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations); 
-    }
-
-
-    function decodeAsciiHex($input) {
-        $output = "";
-    
-        $isOdd = true;
-        $isComment = false;
-    
-        for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) {
-            $c = $input[$i];
-    
-            if($isComment) {
-                if ($c == '\r' || $c == '\n')
-                    $isComment = false;
-                continue;
-            }
-    
-            switch($c) {
-                case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break;
-                case '%': 
-                    $isComment = true;
-                break;
-    
-                default:
-                    $code = hexdec($c);
-                    if($code === 0 && $c != '0')
-                        return "";
-    
-                    if($isOdd)
-                        $codeHigh = $code;
-                    else
-                        $output .= chr($codeHigh * 16 + $code);
-    
-                    $isOdd = !$isOdd;
-                break;
-            }
-        }
-    
-        if($input[$i] != '>')
-            return "";
-    
-        if($isOdd)
-            $output .= chr($codeHigh * 16);
-    
-        return $output;
-    }
-    
-    function decodeAscii85($input) {
-        $output = "";
-    
-        $isComment = false;
-        $ords = array();
-        
-        for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) {
-            $c = $input[$i];
-    
-            if($isComment) {
-                if ($c == '\r' || $c == '\n')
-                    $isComment = false;
-                continue;
-            }
-    
-            if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ')
-                continue;
-            if ($c == '%') {
-                $isComment = true;
-                continue;
-            }
-            if ($c == 'z' && $state === 0) {
-                $output .= str_repeat(chr(0), 4);
-                continue;
-            }
-            if ($c < '!' || $c > 'u')
-                return "";
-    
-            $code = ord($input[$i]) & 0xff;
-            $ords[$state++] = $code - ord('!');
-    
-            if ($state == 5) {
-                $state = 0;
-                for ($sum = 0, $j = 0; $j < 5; $j++)
-                    $sum = $sum * 85 + $ords[$j];
-                for ($j = 3; $j >= 0; $j--)
-                    $output .= chr($sum >> ($j * 8));
-            }
-        }
-        if ($state === 1)
-            return "";
-        elseif ($state > 1) {
-            for ($i = 0, $sum = 0; $i < $state; $i++)
-                $sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i);
-            for ($i = 0; $i < $state - 1; $i++)
-                $ouput .= chr($sum >> ((3 - $i) * 8));
-        }
-    
-        return $output;
-    }
-    
-    function decodeFlate($input) {
-        return gzuncompress($input);
-    }
-    
-    function getObjectOptions($object) {
-        $options = array();
-
-        if (preg_match("#<<(.*)>>#ismU", $object, $options)) {
-            $options = explode("/", $options[1]);
-            @array_shift($options);
-    
-            $o = array();
-            for ($j = 0; $j < @count($options); $j++) {
-                $options[$j] = preg_replace("#\s+#", " ", trim($options[$j]));
-                if (strpos($options[$j], " ") !== false) {
-                    $parts = explode(" ", $options[$j]);
-                    $o[$parts[0]] = $parts[1];
-                } else
-                    $o[$options[$j]] = true;
-            }
-            $options = $o;
-            unset($o);
-        }
-    
-        return $options;
-    }
-    
-    function getDecodedStream($stream, $options) {
-        $data = "";
-        if (empty($options["Filter"]))
-            $data = $stream;
-        else {
-            $length = !empty($options["Length"]) ? $options["Length"] : strlen($stream);
-            $_stream = substr($stream, 0, $length);
-    
-            foreach ($options as $key => $value) {
-                if ($key == "ASCIIHexDecode")
-                    $_stream = $this->decodeAsciiHex($_stream);
-                if ($key == "ASCII85Decode")
-                    $_stream = $this->decodeAscii85($_stream);
-                if ($key == "FlateDecode")
-                    $_stream = $this->decodeFlate($_stream);
-                if ($key == "Crypt") { // TO DO
-                }
-            }
-            $data = $_stream;
-        }
-        return $data;
-    }
-    function getDirtyTexts(&$texts, $textContainers) {
-        
-        for ($j = 0; $j < count($textContainers); $j++) {
-            if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts))
-                $texts = array_merge($texts, @$parts[1]);
-            elseif(preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
-                $texts = array_merge($texts, @$parts[1]);
-            elseif(preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
-                $texts = array_merge($texts, @$parts[1]);
-        }
-    }
-    function getCharTransformations(&$transformations, $stream) {
-        preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
-        preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);
-    
-        for ($j = 0; $j < count($chars); $j++) {
-            $count = $chars[$j][1];
-            $current = explode("\n", trim($chars[$j][2]));
-            for ($k = 0; $k < $count && $k < count($current); $k++) {
-                if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map))
-                    $transformations[str_pad($map[1], 4, "0")] = $map[2];
-            }
-        }
-        for ($j = 0; $j < count($ranges); $j++) {
-            $count = $ranges[$j][1];
-            $current = explode("\n", trim($ranges[$j][2]));
-            for ($k = 0; $k < $count && $k < count($current); $k++) {
-                if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) {
-                    $from = hexdec($map[1]);
-                    $to = hexdec($map[2]);
-                    $_from = hexdec($map[3]);
-    
-                    for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
-                        $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
-                } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) {
-                    $from = hexdec($map[1]);
-                    $to = hexdec($map[2]);
-                    $parts = preg_split("#\s+#", trim($map[3]));
-                    
-                    for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++)
-                        $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n]));
-                }
-            }
-        }
-    }
-    function getTextUsingTransformations($texts, $transformations) {
-        $document = "";
-        for ($i = 0; $i < count($texts); $i++) {
-            $isHex = false;
-            $isPlain = false;
-    
-            $hex = "";
-            $plain = "";
-            for ($j = 0; $j < strlen($texts[$i]); $j++) {
-                $c = $texts[$i][$j];
-                switch($c) {
-                    case "<":
-                        $hex = "";
-                        $isHex = true;
-                    break;
-                    case ">":
-                        $hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO)
-                        for ($k = 0; $k < count($hexs); $k++) {
-                            $chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero
-                            if (isset($transformations[$chex]))
-                                $chex = $transformations[$chex];
-                            $document .= html_entity_decode("&#x".$chex.";");
-                        }
-                        $isHex = false;
-                    break;
-                    case "(":
-                        $plain = "";
-                        $isPlain = true;
-                    break;
-                    case ")":
-                        $document .= $plain;
-                        $isPlain = false;
-                    break;
-                    case "\\":
-                        $c2 = $texts[$i][$j + 1];
-                        if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2;
-                        elseif ($c2 == "n") $plain .= '\n';
-                        elseif ($c2 == "r") $plain .= '\r';
-                        elseif ($c2 == "t") $plain .= '\t';
-                        elseif ($c2 == "b") $plain .= '\b';
-                        elseif ($c2 == "f") $plain .= '\f';
-                        elseif ($c2 >= '0' && $c2 <= '9') {
-                            $oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3));
-                            $j += strlen($oct) - 1;
-                            $plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes);
-                        }
-                        $j++;
-                    break;
-    
-                    default:
-                        if ($isHex)
-                            $hex .= $c;
-                        if ($isPlain)
-                            $plain .= $c;
-                    break;
-                }
-            }
-            $document .= "\n";
-        }
-    
-        return $document;
-    }
-}
-?>
--- a/src/classes/tainacan-creator.php
+++ b/src/classes/tainacan-creator.php
@ -34,7 +34,6 @@ const DIRS = [

 require_once('libs/wp-async-request.php');
 require_once('libs/wp-background-process.php');
-require_once('libs/class-pdf2text.php');
 require_once('class-tainacan-background-process.php');
 require_once('tainacan-utils.php');
 require_once(TAINACAN_IMPORTER_DIR . 'class-tainacan-bg-importer.php');