From 2e2b6f93f338481d68ac744f4250e5137049b58b Mon Sep 17 00:00:00 2001 From: leogermani Date: Thu, 8 Aug 2019 11:51:51 -0300 Subject: [PATCH 01/12] starting implementation of index pdf contents using pdf2text class #245 --- src/classes/class-tainacan-media.php | 26 ++ src/classes/libs/class-pdf2text.php | 377 +++++++++++++++++++++++++++ 2 files changed, 403 insertions(+) create mode 100644 src/classes/libs/class-pdf2text.php diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php index e8679da87..b8fba368b 100644 --- a/src/classes/class-tainacan-media.php +++ b/src/classes/class-tainacan-media.php @@ -227,5 +227,31 @@ class Media { if( $this->THROW_EXCPTION_ON_FATAL_ERROR ) throw new \Exception("fatal error"); } + + public index_pdf_content($file, $item_id) { + + if ( ! \file_exists($file) ) { + return false; + } + + // Allow plugins to implement other approach to index pdf contents + $alternate = apply_filters('tainacan-index-pdf', null, $file, $item_id); + if ( ! \is_null($alternate) ) { + return $alternate; + } + + require_once( TAINACAN_CLASSES_DIR . '/lib/class-pdf2text.php' ); + + $PDF2Text = new PDF2Text(); + $PDF2Text->setFilename($file); + + try { + $PDF2Text->decodePDF(); + update_post_meta( $item_id, '_pdf_index', $PDF2Text->output() ); + } catch($e) { + return false; + } + + } } \ No newline at end of file diff --git a/src/classes/libs/class-pdf2text.php b/src/classes/libs/class-pdf2text.php new file mode 100644 index 000000000..c46b51691 --- /dev/null +++ b/src/classes/libs/class-pdf2text.php @@ -0,0 +1,377 @@ +setFilename('test.pdf'); +$a->decodePDF(); +echo $a->output(); + +ALTERNATIVES: +Other excellent options to search within a PDF: +- Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution +- pdflib TET (http://www.pdflib.com/products/tet/) +- Online converter: http://snowtide.com/PDFTextStream +*/ + + +class PDF2Text { + // Some settings + var $multibyte = 2; // Use setUnicode(TRUE|FALSE) + var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None) + + // Variables + var $filename = ''; + var $decodedtext = ''; + + function setFilename($filename) { + // Reset + $this->decodedtext = ''; + $this->filename = $filename; + } + + function output($echo = false) { + if($echo) echo $this->decodedtext; + else return $this->decodedtext; + } + + function setUnicode($input) { + // 4 for unicode. But 2 should work in most cases just fine + if($input == true) $this->multibyte = 4; + else $this->multibyte = 2; + } + + function decodePDF() { + // Read the data from pdf file + $infile = @file_get_contents($this->filename, FILE_BINARY); + if (empty($infile)) + return ""; + + // Get all text data. + $transformations = array(); + $texts = array(); + + // Get the list of all objects. + preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $objects); + $objects = @$objects[1]; + + // Select objects with streams. + for ($i = 0; $i < count($objects); $i++) { + $currentObject = $objects[$i]; + + // Check if an object includes data stream. + if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) { + $stream = ltrim($stream[1]); + + // Check object parameters and look for text data. + $options = $this->getObjectOptions($currentObject); + + if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) + continue; + + // Hack, length doesnt always seem to be correct + unset($options["Length"]); + + // So, we have text data. Decode it. + $data = $this->getDecodedStream($stream, $options); + + if (strlen($data)) { + if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) { + $textContainers = @$textContainers[1]; + $this->getDirtyTexts($texts, $textContainers); + } else + $this->getCharTransformations($transformations, $data); + } + } + } + + // Analyze text blocks taking into account character transformations and return results. + $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations); + } + + + function decodeAsciiHex($input) { + $output = ""; + + $isOdd = true; + $isComment = false; + + for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) { + $c = $input[$i]; + + if($isComment) { + if ($c == '\r' || $c == '\n') + $isComment = false; + continue; + } + + switch($c) { + case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break; + case '%': + $isComment = true; + break; + + default: + $code = hexdec($c); + if($code === 0 && $c != '0') + return ""; + + if($isOdd) + $codeHigh = $code; + else + $output .= chr($codeHigh * 16 + $code); + + $isOdd = !$isOdd; + break; + } + } + + if($input[$i] != '>') + return ""; + + if($isOdd) + $output .= chr($codeHigh * 16); + + return $output; + } + + function decodeAscii85($input) { + $output = ""; + + $isComment = false; + $ords = array(); + + for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) { + $c = $input[$i]; + + if($isComment) { + if ($c == '\r' || $c == '\n') + $isComment = false; + continue; + } + + if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ') + continue; + if ($c == '%') { + $isComment = true; + continue; + } + if ($c == 'z' && $state === 0) { + $output .= str_repeat(chr(0), 4); + continue; + } + if ($c < '!' || $c > 'u') + return ""; + + $code = ord($input[$i]) & 0xff; + $ords[$state++] = $code - ord('!'); + + if ($state == 5) { + $state = 0; + for ($sum = 0, $j = 0; $j < 5; $j++) + $sum = $sum * 85 + $ords[$j]; + for ($j = 3; $j >= 0; $j--) + $output .= chr($sum >> ($j * 8)); + } + } + if ($state === 1) + return ""; + elseif ($state > 1) { + for ($i = 0, $sum = 0; $i < $state; $i++) + $sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i); + for ($i = 0; $i < $state - 1; $i++) + $ouput .= chr($sum >> ((3 - $i) * 8)); + } + + return $output; + } + + function decodeFlate($input) { + return gzuncompress($input); + } + + function getObjectOptions($object) { + $options = array(); + + if (preg_match("#<<(.*)>>#ismU", $object, $options)) { + $options = explode("/", $options[1]); + @array_shift($options); + + $o = array(); + for ($j = 0; $j < @count($options); $j++) { + $options[$j] = preg_replace("#\s+#", " ", trim($options[$j])); + if (strpos($options[$j], " ") !== false) { + $parts = explode(" ", $options[$j]); + $o[$parts[0]] = $parts[1]; + } else + $o[$options[$j]] = true; + } + $options = $o; + unset($o); + } + + return $options; + } + + function getDecodedStream($stream, $options) { + $data = ""; + if (empty($options["Filter"])) + $data = $stream; + else { + $length = !empty($options["Length"]) ? $options["Length"] : strlen($stream); + $_stream = substr($stream, 0, $length); + + foreach ($options as $key => $value) { + if ($key == "ASCIIHexDecode") + $_stream = $this->decodeAsciiHex($_stream); + if ($key == "ASCII85Decode") + $_stream = $this->decodeAscii85($_stream); + if ($key == "FlateDecode") + $_stream = $this->decodeFlate($_stream); + if ($key == "Crypt") { // TO DO + } + } + $data = $_stream; + } + return $data; + } + function getDirtyTexts(&$texts, $textContainers) { + + for ($j = 0; $j < count($textContainers); $j++) { + if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts)) + $texts = array_merge($texts, @$parts[1]); + elseif(preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) + $texts = array_merge($texts, @$parts[1]); + elseif(preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) + $texts = array_merge($texts, @$parts[1]); + } + } + function getCharTransformations(&$transformations, $stream) { + preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER); + preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER); + + for ($j = 0; $j < count($chars); $j++) { + $count = $chars[$j][1]; + $current = explode("\n", trim($chars[$j][2])); + for ($k = 0; $k < $count && $k < count($current); $k++) { + if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map)) + $transformations[str_pad($map[1], 4, "0")] = $map[2]; + } + } + for ($j = 0; $j < count($ranges); $j++) { + $count = $ranges[$j][1]; + $current = explode("\n", trim($ranges[$j][2])); + for ($k = 0; $k < $count && $k < count($current); $k++) { + if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) { + $from = hexdec($map[1]); + $to = hexdec($map[2]); + $_from = hexdec($map[3]); + + for ($m = $from, $n = 0; $m <= $to; $m++, $n++) + $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n); + } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) { + $from = hexdec($map[1]); + $to = hexdec($map[2]); + $parts = preg_split("#\s+#", trim($map[3])); + + for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) + $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n])); + } + } + } + } + function getTextUsingTransformations($texts, $transformations) { + $document = ""; + for ($i = 0; $i < count($texts); $i++) { + $isHex = false; + $isPlain = false; + + $hex = ""; + $plain = ""; + for ($j = 0; $j < strlen($texts[$i]); $j++) { + $c = $texts[$i][$j]; + switch($c) { + case "<": + $hex = ""; + $isHex = true; + break; + case ">": + $hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO) + for ($k = 0; $k < count($hexs); $k++) { + $chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero + if (isset($transformations[$chex])) + $chex = $transformations[$chex]; + $document .= html_entity_decode("&#x".$chex.";"); + } + $isHex = false; + break; + case "(": + $plain = ""; + $isPlain = true; + break; + case ")": + $document .= $plain; + $isPlain = false; + break; + case "\\": + $c2 = $texts[$i][$j + 1]; + if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2; + elseif ($c2 == "n") $plain .= '\n'; + elseif ($c2 == "r") $plain .= '\r'; + elseif ($c2 == "t") $plain .= '\t'; + elseif ($c2 == "b") $plain .= '\b'; + elseif ($c2 == "f") $plain .= '\f'; + elseif ($c2 >= '0' && $c2 <= '9') { + $oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3)); + $j += strlen($oct) - 1; + $plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes); + } + $j++; + break; + + default: + if ($isHex) + $hex .= $c; + if ($isPlain) + $plain .= $c; + break; + } + } + $document .= "\n"; + } + + return $document; + } +} +?> \ No newline at end of file From d5d50d7d605a3b486fb61d7897490091d199bb07 Mon Sep 17 00:00:00 2001 From: vnmedeiros Date: Fri, 6 Sep 2019 16:24:55 -0300 Subject: [PATCH 02/12] add and remove content on metadata if is a pdf document on item #245 --- src/classes/class-tainacan-media.php | 33 +++++++++++++++++++++------- src/classes/tainacan-creator.php | 1 + 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php index b8fba368b..584a93302 100644 --- a/src/classes/class-tainacan-media.php +++ b/src/classes/class-tainacan-media.php @@ -228,11 +228,25 @@ class Media { throw new \Exception("fatal error"); } - public index_pdf_content($file, $item_id) { - + public function index_pdf_content($file, $item_id) { + + $content_index_meta = '_pdf_content_index'; + if (defined('TAINACAN_CONTENT_PDF_INDEX_METADATA')) { + $content_index_meta = TAINACAN_CONTENT_PDF_INDEX_METADATA; + } + + if ($file == null) { + $meta_id = update_post_meta( $item_id, $content_index_meta, null ); + return true; + } + if ( ! \file_exists($file) ) { return false; } + + if ( $this->get_mime_content_type($file) != 'application/pdf') { + return null; + } // Allow plugins to implement other approach to index pdf contents $alternate = apply_filters('tainacan-index-pdf', null, $file, $item_id); @@ -240,18 +254,21 @@ class Media { return $alternate; } - require_once( TAINACAN_CLASSES_DIR . '/lib/class-pdf2text.php' ); - $PDF2Text = new PDF2Text(); + + $PDF2Text = new \PDF2Text(); $PDF2Text->setFilename($file); - try { $PDF2Text->decodePDF(); - update_post_meta( $item_id, '_pdf_index', $PDF2Text->output() ); - } catch($e) { + $content = preg_replace('/[^a-zA-Z0-9_ -]/s','',$PDF2Text->output()); // melhorar essa expresão regular + //$content = filter_var ( $PDF2Text->output(), FILTER_SANITIZE_STRING); + //$content = iconv('ISO-8859-1', 'UTF-8//TRANSLIT//IGNORE', $PDF2Text->output()); + //$content = preg_replace('/[\r\n\\n]+/', "\n", $content); + $meta_id = update_post_meta( $item_id, $content_index_meta, $content ); + } catch(Exception $e) { + error_log('Caught exception: ' . $e->getMessage() . "\n"); return false; } - } } \ No newline at end of file diff --git a/src/classes/tainacan-creator.php b/src/classes/tainacan-creator.php index 964f38ee8..5c513b00f 100644 --- a/src/classes/tainacan-creator.php +++ b/src/classes/tainacan-creator.php @@ -34,6 +34,7 @@ const DIRS = [ require_once('libs/wp-async-request.php'); require_once('libs/wp-background-process.php'); +require_once('libs/class-pdf2text.php'); require_once('class-tainacan-background-process.php'); require_once('tainacan-utils.php'); require_once(TAINACAN_IMPORTER_DIR . 'class-tainacan-bg-importer.php'); From afc24ee50b9901dcf13ed71337a03ff7c954ed6b Mon Sep 17 00:00:00 2001 From: vnmedeiros Date: Fri, 6 Sep 2019 16:25:11 -0300 Subject: [PATCH 03/12] add and remove content on metadata if is a pdf document on item # 245 --- .../repositories/class-tainacan-items.php | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/classes/repositories/class-tainacan-items.php b/src/classes/repositories/class-tainacan-items.php index 30ed2c549..ba4291e45 100644 --- a/src/classes/repositories/class-tainacan-items.php +++ b/src/classes/repositories/class-tainacan-items.php @@ -376,6 +376,25 @@ class Items extends Repository { return $where; } + /** + * generate a content of document to index. + * + * @param Entities\Item $item The item + * + * @return boolean + */ + public function generate_index_content(Entities\Item $item) { + $TainacanMedia = \Tainacan\Media::get_instance(); + if ( empty( $item->get_document() ) ) { + $TainacanMedia->index_pdf_content( null, $item->get_ID() ); + } elseif ( $item->get_document_type() == 'attachment' ) { + if (! wp_attachment_is_image( $item->get_document() ) ) { + $filepath = get_attached_file( $item->get_document() ); + $TainacanMedia->index_pdf_content( $filepath, $item->get_ID() ); + } + } + return true; + } /** * Get a default thumbnail ID from the item document. @@ -452,13 +471,12 @@ class Items extends Repository { ) { $thumb_id = $this->get_thumbnail_id_from_document( $updated_item ); - if ( ! is_null( $thumb_id ) ) { set_post_thumbnail( $updated_item->get_id(), (int) $thumb_id ); } } - + $this->generate_index_content( $updated_item ); } /** From f76c706896481a1c68c7b443a8b3b2d626a289a7 Mon Sep 17 00:00:00 2001 From: vnmedeiros Date: Tue, 10 Sep 2019 13:14:28 -0300 Subject: [PATCH 04/12] add option to advanced search indexed content documents #245 --- src/admin/components/advanced-search/advanced-search.vue | 3 ++- src/classes/class-tainacan-media.php | 9 ++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/admin/components/advanced-search/advanced-search.vue b/src/admin/components/advanced-search/advanced-search.vue index ecd72a979..7526288af 100644 --- a/src/admin/components/advanced-search/advanced-search.vue +++ b/src/admin/components/advanced-search/advanced-search.vue @@ -54,6 +54,7 @@ :value="`${metadatum.id}-${metadatum.metadata_type_options.taxonomy}-${metadatum.metadata_type_object.primitive_type}`" :key="metadatum.id" >{{ metadatum.name }} + @@ -533,7 +534,7 @@ } else { this.advancedSearchQuery.metaquery = Object.assign({}, this.advancedSearchQuery.metaquery, { [`${searchCriterion}`]: { - key: Number(criteriaKey[0]), + key: criteriaKey[0], compare: '=', originalMeta: value, } diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php index 584a93302..aac90529d 100644 --- a/src/classes/class-tainacan-media.php +++ b/src/classes/class-tainacan-media.php @@ -230,7 +230,7 @@ class Media { public function index_pdf_content($file, $item_id) { - $content_index_meta = '_pdf_content_index'; + $content_index_meta = '_document_content_index'; if (defined('TAINACAN_CONTENT_PDF_INDEX_METADATA')) { $content_index_meta = TAINACAN_CONTENT_PDF_INDEX_METADATA; } @@ -253,14 +253,13 @@ class Media { if ( ! \is_null($alternate) ) { return $alternate; } - - $PDF2Text = new \PDF2Text(); $PDF2Text->setFilename($file); try { - $PDF2Text->decodePDF(); - $content = preg_replace('/[^a-zA-Z0-9_ -]/s','',$PDF2Text->output()); // melhorar essa expresão regular + $PDF2Text->decodePDF(); + //$content = $PDF2Text->output(); // melhorar essa expresão regular + $content = preg_replace('/[^a-zA-Z0-9_ -]/s','',$PDF2Text->output()); // melhorar essa expresão regular //$content = filter_var ( $PDF2Text->output(), FILTER_SANITIZE_STRING); //$content = iconv('ISO-8859-1', 'UTF-8//TRANSLIT//IGNORE', $PDF2Text->output()); //$content = preg_replace('/[\r\n\\n]+/', "\n", $content); From 912dc51585ff134f4c94099ff45986a6591871f4 Mon Sep 17 00:00:00 2001 From: vnmedeiros Date: Tue, 10 Sep 2019 16:05:07 -0300 Subject: [PATCH 05/12] fix encoding and regular expression #245 --- src/classes/class-tainacan-media.php | 102 +++++++++++++-------------- 1 file changed, 50 insertions(+), 52 deletions(-) diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php index aac90529d..2412b4ede 100644 --- a/src/classes/class-tainacan-media.php +++ b/src/classes/class-tainacan-media.php @@ -10,13 +10,13 @@ class Media { private static $file_handle = null; private static $file_name = null; - public static function get_instance() { - if(!isset(self::$instance)) { - self::$instance = new self(); - } + public static function get_instance() { + if(!isset(self::$instance)) { + self::$instance = new self(); + } - return self::$instance; - } + return self::$instance; + } /** * Insert an attachment from an URL address. @@ -59,58 +59,58 @@ class Media { } - /** - * Avoid memory overflow problems with large files (Exceeded maximum memory limit of PHP) - * - * @param $url - * @return string the file path - */ - public function save_remote_file($url) { + /** + * Avoid memory overflow problems with large files (Exceeded maximum memory limit of PHP) + * + * @param $url + * @return string the file path + */ + public function save_remote_file($url) { - set_time_limit(0); + set_time_limit(0); - $filename = tempnam(sys_get_temp_dir(), basename($url)); + $filename = tempnam(sys_get_temp_dir(), basename($url)); - # Open the file for writing... - self::$file_handle = fopen($filename, 'w+'); - self::$file_name = $filename; + # Open the file for writing... + self::$file_handle = fopen($filename, 'w+'); + self::$file_name = $filename; - $callback = function ($ch, $str) { - $len = fwrite(self::$file_handle, $str); - return $len; - }; + $callback = function ($ch, $str) { + $len = fwrite(self::$file_handle, $str); + return $len; + }; - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_FILE, self::$file_handle); - curl_setopt($ch, CURLOPT_HEADER, 0); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_BINARYTRANSFER, true); - curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); # optional - curl_setopt($ch, CURLOPT_TIMEOUT, -1); # optional: -1 = unlimited, 3600 = 1 hour - curl_setopt($ch, CURLOPT_VERBOSE, false); # Set to true to see all the innards + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_FILE, self::$file_handle); + curl_setopt($ch, CURLOPT_HEADER, 0); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_BINARYTRANSFER, true); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); # optional + curl_setopt($ch, CURLOPT_TIMEOUT, -1); # optional: -1 = unlimited, 3600 = 1 hour + curl_setopt($ch, CURLOPT_VERBOSE, false); # Set to true to see all the innards - # Only if you need to bypass SSL certificate validation - curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); + # Only if you need to bypass SSL certificate validation + curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); - # Assign a callback function to the CURL Write-Function - curl_setopt($ch, CURLOPT_WRITEFUNCTION, $callback); + # Assign a callback function to the CURL Write-Function + curl_setopt($ch, CURLOPT_WRITEFUNCTION, $callback); - # Exceute the download - note we DO NOT put the result into a variable! - curl_exec($ch); + # Exceute the download - note we DO NOT put the result into a variable! + curl_exec($ch); - # Close CURL - curl_close($ch); + # Close CURL + curl_close($ch); - # Close the file pointer - fclose(self::$file_handle); + # Close the file pointer + fclose(self::$file_handle); - return $filename; - } + return $filename; + } - /** + /** * Insert an attachment from an URL address. * * @param blob $blob bitstream of the attachment @@ -230,7 +230,7 @@ class Media { public function index_pdf_content($file, $item_id) { - $content_index_meta = '_document_content_index'; + $content_index_meta = '_document_content_index'; if (defined('TAINACAN_CONTENT_PDF_INDEX_METADATA')) { $content_index_meta = TAINACAN_CONTENT_PDF_INDEX_METADATA; } @@ -255,14 +255,12 @@ class Media { } $PDF2Text = new \PDF2Text(); + $PDF2Text->setUnicode(true); $PDF2Text->setFilename($file); try { - $PDF2Text->decodePDF(); - //$content = $PDF2Text->output(); // melhorar essa expresão regular - $content = preg_replace('/[^a-zA-Z0-9_ -]/s','',$PDF2Text->output()); // melhorar essa expresão regular - //$content = filter_var ( $PDF2Text->output(), FILTER_SANITIZE_STRING); - //$content = iconv('ISO-8859-1', 'UTF-8//TRANSLIT//IGNORE', $PDF2Text->output()); - //$content = preg_replace('/[\r\n\\n]+/', "\n", $content); + $PDF2Text->decodePDF(); + $content = preg_replace('~[[:cntrl:]]~', '', $PDF2Text->output()); + $content = mb_convert_encoding($content, 'UTF-8', 'ISO-8859-1'); $meta_id = update_post_meta( $item_id, $content_index_meta, $content ); } catch(Exception $e) { error_log('Caught exception: ' . $e->getMessage() . "\n"); From 3cc9172d597af28cf9a513166ceb415198a29bd4 Mon Sep 17 00:00:00 2001 From: vnmedeiros Date: Tue, 10 Sep 2019 16:27:31 -0300 Subject: [PATCH 06/12] remove the Number constructor in frontend #245 --- src/admin/components/advanced-search/advanced-search.vue | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/admin/components/advanced-search/advanced-search.vue b/src/admin/components/advanced-search/advanced-search.vue index 7526288af..76c22f74c 100644 --- a/src/admin/components/advanced-search/advanced-search.vue +++ b/src/admin/components/advanced-search/advanced-search.vue @@ -526,7 +526,7 @@ if(criteriaKey[2] != 'date' && criteriaKey[2] != 'int' && criteriaKey[2] != 'float'){ this.advancedSearchQuery.metaquery = Object.assign({}, this.advancedSearchQuery.metaquery, { [`${searchCriterion}`]: { - key: Number(criteriaKey[0]), + key: criteriaKey[0], compare: 'LIKE', originalMeta: value, } @@ -534,7 +534,7 @@ } else { this.advancedSearchQuery.metaquery = Object.assign({}, this.advancedSearchQuery.metaquery, { [`${searchCriterion}`]: { - key: criteriaKey[0], + key: Number(criteriaKey[0]), compare: '=', originalMeta: value, } From 005a304edbbe220baaf166944ec777bf66168a27 Mon Sep 17 00:00:00 2001 From: vnmedeiros Date: Wed, 11 Sep 2019 15:32:08 -0300 Subject: [PATCH 07/12] index document content using the wp-cli #245 --- src/cli/class-tainacan-cli-document.php | 119 ++++++++++++++++++++++++ src/cli/class-tainacan-cli.php | 3 +- 2 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 src/cli/class-tainacan-cli-document.php diff --git a/src/cli/class-tainacan-cli-document.php b/src/cli/class-tainacan-cli-document.php new file mode 100644 index 000000000..0e930446e --- /dev/null +++ b/src/cli/class-tainacan-cli-document.php @@ -0,0 +1,119 @@ +items_repository = Repositories\Items::get_instance(); + $this->collection_repository = Repositories\Collections::get_instance(); + $this->result_count = ['indexed_documents' => 0]; + } + + /** + * index content of documents + * + * ## OPTIONS + * [--collection-id=] + * : Specific ID of the collection into which the document content of the items will be indexed, if not informed all collections will be index. + * + * + * [--dry-run] + * : only count the total of item which will index, just output a report + * + * ## EXAMPLES + * + * wp tainacan index-content --collection-id=416 + * indexing documents of items to collection 416: 100% [====================================================] 0:00 / 0:00 + * Success: + * 7 items indexed + * + * + * wp tainacan index-content + * indexing documents of items to collection 416: 100% [====================================================] 0:00 / 0:00 + * Success: + * 7 items indexed + * indexing documents of items to collection 301: 100% [====================================================] 0:00 / 0:00 + * Success: + * 10 items indexed + * + */ + public function __invoke($args, $assoc_args) { + $this->dry_run = false; + if ( !empty($assoc_args['dry-run']) ) { + $this->dry_run = true; + } + + if( empty($assoc_args['collection-id']) ) { + $this->index_item_all_collections(); + } else { + $collection_id = $assoc_args['collection-id']; + $this->index_item($collection_id); + } + } + + private function index_item_all_collections() { + $collections = $this->collection_repository->fetch(['posts_per_page'=>-1], 'OBJECT'); + foreach ($collections as $collection) { + $this->result_count['indexed_documents'] = 0; + $this->index_item($collection->get_id()); + } + } + + private function index_item($collection_id) { + $per_page = 50; $page = 1; + $args = [ + 'posts_per_page'=> $per_page, + 'paged' => $page, + 'post_status' => get_post_stati() + ]; + $collection_items = $this->items_repository->fetch($args, $collection_id, 'WP_Query'); + $total = $collection_items->found_posts; + $last_page = ceil($total/$per_page); + + $progress = \WP_CLI\Utils\make_progress_bar( "indexing documents of items to collection $collection_id:", $total ); + while ($page++ <= $last_page) { + if ($collection_items->have_posts()) { + while ( $collection_items->have_posts() ) { + $collection_items->the_post(); + $item = new Entities\Item($collection_items->post); + $this->index_content_document_item($item); + $progress->tick(); + } + } + $args['paged'] = $page; + $collection_items = $this->items_repository->fetch($args, $collection_id, 'WP_Query'); + } + $progress->finish(); + + $msg = "\n" . $this->result_count['indexed_documents'] . " items indexed"; + + \WP_CLI::success( $msg ); + } + + private function index_content_document_item($item) { + if (! $item instanceof Entities\Item) { + \WP_CLI::error( 'An item with this ID was not found', true ); + } + + if ( empty( $item->get_document() ) ) { + return null; + } + + $this->result_count['indexed_documents']++; + if ($this->dry_run) + return true; + return $this->items_repository->generate_index_content($item); + } +} + + + ?> \ No newline at end of file diff --git a/src/cli/class-tainacan-cli.php b/src/cli/class-tainacan-cli.php index a391abda5..16409f6ce 100644 --- a/src/cli/class-tainacan-cli.php +++ b/src/cli/class-tainacan-cli.php @@ -28,7 +28,8 @@ class Cli { \WP_CLI::add_command('tainacan garbage-collector', 'Tainacan\Cli_Garbage_Collector'); \WP_CLI::add_command('tainacan move-attachments-to-items-folder', 'Tainacan\Cli_Move_Attachments'); - \WP_CLI::add_command('tainacan collection', 'Tainacan\Cli_Collection'); + \WP_CLI::add_command('tainacan collection', 'Tainacan\Cli_Collection'); + \WP_CLI::add_command('tainacan index-content', 'Tainacan\Cli_Document'); } From dbec21d502a8e06ab9041d2fe642a85381511e7e Mon Sep 17 00:00:00 2001 From: vnmedeiros Date: Wed, 11 Sep 2019 23:53:47 -0300 Subject: [PATCH 08/12] improvements on detect encode and WP-Cli #245 --- src/classes/class-tainacan-media.php | 6 +++++- src/cli/class-tainacan-cli-document.php | 18 +++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php index 2412b4ede..730ca9e7f 100644 --- a/src/classes/class-tainacan-media.php +++ b/src/classes/class-tainacan-media.php @@ -260,7 +260,11 @@ class Media { try { $PDF2Text->decodePDF(); $content = preg_replace('~[[:cntrl:]]~', '', $PDF2Text->output()); - $content = mb_convert_encoding($content, 'UTF-8', 'ISO-8859-1'); + + $wp_charset = get_bloginfo('charset'); + $content_charset = mb_detect_encoding($content); + $content = mb_convert_encoding($content, $wp_charset, $content_charset); + $meta_id = update_post_meta( $item_id, $content_index_meta, $content ); } catch(Exception $e) { error_log('Caught exception: ' . $e->getMessage() . "\n"); diff --git a/src/cli/class-tainacan-cli-document.php b/src/cli/class-tainacan-cli-document.php index 0e930446e..fe7256040 100644 --- a/src/cli/class-tainacan-cli-document.php +++ b/src/cli/class-tainacan-cli-document.php @@ -22,8 +22,8 @@ class Cli_Document { * index content of documents * * ## OPTIONS - * [--collection-id=] - * : Specific ID of the collection into which the document content of the items will be indexed, if not informed all collections will be index. + * [--collection=] + * : Specific ID of the collection into which the document content of the items will be indexed, or 'all' to all collections. * * * [--dry-run] @@ -31,13 +31,13 @@ class Cli_Document { * * ## EXAMPLES * - * wp tainacan index-content --collection-id=416 + * wp tainacan index-content --collection=416 * indexing documents of items to collection 416: 100% [====================================================] 0:00 / 0:00 * Success: * 7 items indexed * * - * wp tainacan index-content + * wp tainacan index-content --collection=all * indexing documents of items to collection 416: 100% [====================================================] 0:00 / 0:00 * Success: * 7 items indexed @@ -52,11 +52,15 @@ class Cli_Document { $this->dry_run = true; } - if( empty($assoc_args['collection-id']) ) { + if( empty($assoc_args['collection']) ) { + \WP_CLI::error( 'Wrong parameters', true ); + } + + $collection = $assoc_args['collection']; + if ($collection == 'all') { $this->index_item_all_collections(); } else { - $collection_id = $assoc_args['collection-id']; - $this->index_item($collection_id); + $this->index_item($collection); } } From fc7bb5250d160184d0068518ac5912c4deede47c Mon Sep 17 00:00:00 2001 From: vnmedeiros Date: Wed, 18 Sep 2019 17:48:46 -0300 Subject: [PATCH 09/12] add the lib "PDF Parser" and change to use it --- composer.json | 3 +- src/classes/class-tainacan-media.php | 11 +- src/classes/libs/class-pdf2text.php | 377 --------------------------- src/classes/tainacan-creator.php | 1 - 4 files changed, 6 insertions(+), 386 deletions(-) delete mode 100644 src/classes/libs/class-pdf2text.php diff --git a/composer.json b/composer.json index ae246dec5..d03574e01 100644 --- a/composer.json +++ b/composer.json @@ -3,7 +3,8 @@ "description": "Transforme seu site wordpress em um repositório digital.", "type": "wordpress-plugin", "require": { - "respect/validation": "^1.1" + "respect/validation": "^1.1", + "smalot/pdfparser": "*" }, "require-dev": { "squizlabs/php_codesniffer": "^2.2 || ^3.0.2", diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php index 730ca9e7f..0e8ad8c3d 100644 --- a/src/classes/class-tainacan-media.php +++ b/src/classes/class-tainacan-media.php @@ -229,7 +229,7 @@ class Media { } public function index_pdf_content($file, $item_id) { - + $content_index_meta = '_document_content_index'; if (defined('TAINACAN_CONTENT_PDF_INDEX_METADATA')) { $content_index_meta = TAINACAN_CONTENT_PDF_INDEX_METADATA; @@ -239,7 +239,7 @@ class Media { $meta_id = update_post_meta( $item_id, $content_index_meta, null ); return true; } - + if ( ! \file_exists($file) ) { return false; } @@ -254,12 +254,9 @@ class Media { return $alternate; } - $PDF2Text = new \PDF2Text(); - $PDF2Text->setUnicode(true); - $PDF2Text->setFilename($file); try { - $PDF2Text->decodePDF(); - $content = preg_replace('~[[:cntrl:]]~', '', $PDF2Text->output()); + $parser = new \Smalot\PdfParser\Parser(); + $content = $parser->parseFile($file)->getText(); $wp_charset = get_bloginfo('charset'); $content_charset = mb_detect_encoding($content); diff --git a/src/classes/libs/class-pdf2text.php b/src/classes/libs/class-pdf2text.php deleted file mode 100644 index c46b51691..000000000 --- a/src/classes/libs/class-pdf2text.php +++ /dev/null @@ -1,377 +0,0 @@ -setFilename('test.pdf'); -$a->decodePDF(); -echo $a->output(); - -ALTERNATIVES: -Other excellent options to search within a PDF: -- Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution -- pdflib TET (http://www.pdflib.com/products/tet/) -- Online converter: http://snowtide.com/PDFTextStream -*/ - - -class PDF2Text { - // Some settings - var $multibyte = 2; // Use setUnicode(TRUE|FALSE) - var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None) - - // Variables - var $filename = ''; - var $decodedtext = ''; - - function setFilename($filename) { - // Reset - $this->decodedtext = ''; - $this->filename = $filename; - } - - function output($echo = false) { - if($echo) echo $this->decodedtext; - else return $this->decodedtext; - } - - function setUnicode($input) { - // 4 for unicode. But 2 should work in most cases just fine - if($input == true) $this->multibyte = 4; - else $this->multibyte = 2; - } - - function decodePDF() { - // Read the data from pdf file - $infile = @file_get_contents($this->filename, FILE_BINARY); - if (empty($infile)) - return ""; - - // Get all text data. - $transformations = array(); - $texts = array(); - - // Get the list of all objects. - preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $objects); - $objects = @$objects[1]; - - // Select objects with streams. - for ($i = 0; $i < count($objects); $i++) { - $currentObject = $objects[$i]; - - // Check if an object includes data stream. - if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) { - $stream = ltrim($stream[1]); - - // Check object parameters and look for text data. - $options = $this->getObjectOptions($currentObject); - - if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) - continue; - - // Hack, length doesnt always seem to be correct - unset($options["Length"]); - - // So, we have text data. Decode it. - $data = $this->getDecodedStream($stream, $options); - - if (strlen($data)) { - if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) { - $textContainers = @$textContainers[1]; - $this->getDirtyTexts($texts, $textContainers); - } else - $this->getCharTransformations($transformations, $data); - } - } - } - - // Analyze text blocks taking into account character transformations and return results. - $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations); - } - - - function decodeAsciiHex($input) { - $output = ""; - - $isOdd = true; - $isComment = false; - - for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) { - $c = $input[$i]; - - if($isComment) { - if ($c == '\r' || $c == '\n') - $isComment = false; - continue; - } - - switch($c) { - case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break; - case '%': - $isComment = true; - break; - - default: - $code = hexdec($c); - if($code === 0 && $c != '0') - return ""; - - if($isOdd) - $codeHigh = $code; - else - $output .= chr($codeHigh * 16 + $code); - - $isOdd = !$isOdd; - break; - } - } - - if($input[$i] != '>') - return ""; - - if($isOdd) - $output .= chr($codeHigh * 16); - - return $output; - } - - function decodeAscii85($input) { - $output = ""; - - $isComment = false; - $ords = array(); - - for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) { - $c = $input[$i]; - - if($isComment) { - if ($c == '\r' || $c == '\n') - $isComment = false; - continue; - } - - if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ') - continue; - if ($c == '%') { - $isComment = true; - continue; - } - if ($c == 'z' && $state === 0) { - $output .= str_repeat(chr(0), 4); - continue; - } - if ($c < '!' || $c > 'u') - return ""; - - $code = ord($input[$i]) & 0xff; - $ords[$state++] = $code - ord('!'); - - if ($state == 5) { - $state = 0; - for ($sum = 0, $j = 0; $j < 5; $j++) - $sum = $sum * 85 + $ords[$j]; - for ($j = 3; $j >= 0; $j--) - $output .= chr($sum >> ($j * 8)); - } - } - if ($state === 1) - return ""; - elseif ($state > 1) { - for ($i = 0, $sum = 0; $i < $state; $i++) - $sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i); - for ($i = 0; $i < $state - 1; $i++) - $ouput .= chr($sum >> ((3 - $i) * 8)); - } - - return $output; - } - - function decodeFlate($input) { - return gzuncompress($input); - } - - function getObjectOptions($object) { - $options = array(); - - if (preg_match("#<<(.*)>>#ismU", $object, $options)) { - $options = explode("/", $options[1]); - @array_shift($options); - - $o = array(); - for ($j = 0; $j < @count($options); $j++) { - $options[$j] = preg_replace("#\s+#", " ", trim($options[$j])); - if (strpos($options[$j], " ") !== false) { - $parts = explode(" ", $options[$j]); - $o[$parts[0]] = $parts[1]; - } else - $o[$options[$j]] = true; - } - $options = $o; - unset($o); - } - - return $options; - } - - function getDecodedStream($stream, $options) { - $data = ""; - if (empty($options["Filter"])) - $data = $stream; - else { - $length = !empty($options["Length"]) ? $options["Length"] : strlen($stream); - $_stream = substr($stream, 0, $length); - - foreach ($options as $key => $value) { - if ($key == "ASCIIHexDecode") - $_stream = $this->decodeAsciiHex($_stream); - if ($key == "ASCII85Decode") - $_stream = $this->decodeAscii85($_stream); - if ($key == "FlateDecode") - $_stream = $this->decodeFlate($_stream); - if ($key == "Crypt") { // TO DO - } - } - $data = $_stream; - } - return $data; - } - function getDirtyTexts(&$texts, $textContainers) { - - for ($j = 0; $j < count($textContainers); $j++) { - if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts)) - $texts = array_merge($texts, @$parts[1]); - elseif(preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) - $texts = array_merge($texts, @$parts[1]); - elseif(preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) - $texts = array_merge($texts, @$parts[1]); - } - } - function getCharTransformations(&$transformations, $stream) { - preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER); - preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER); - - for ($j = 0; $j < count($chars); $j++) { - $count = $chars[$j][1]; - $current = explode("\n", trim($chars[$j][2])); - for ($k = 0; $k < $count && $k < count($current); $k++) { - if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map)) - $transformations[str_pad($map[1], 4, "0")] = $map[2]; - } - } - for ($j = 0; $j < count($ranges); $j++) { - $count = $ranges[$j][1]; - $current = explode("\n", trim($ranges[$j][2])); - for ($k = 0; $k < $count && $k < count($current); $k++) { - if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) { - $from = hexdec($map[1]); - $to = hexdec($map[2]); - $_from = hexdec($map[3]); - - for ($m = $from, $n = 0; $m <= $to; $m++, $n++) - $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n); - } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) { - $from = hexdec($map[1]); - $to = hexdec($map[2]); - $parts = preg_split("#\s+#", trim($map[3])); - - for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) - $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n])); - } - } - } - } - function getTextUsingTransformations($texts, $transformations) { - $document = ""; - for ($i = 0; $i < count($texts); $i++) { - $isHex = false; - $isPlain = false; - - $hex = ""; - $plain = ""; - for ($j = 0; $j < strlen($texts[$i]); $j++) { - $c = $texts[$i][$j]; - switch($c) { - case "<": - $hex = ""; - $isHex = true; - break; - case ">": - $hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO) - for ($k = 0; $k < count($hexs); $k++) { - $chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero - if (isset($transformations[$chex])) - $chex = $transformations[$chex]; - $document .= html_entity_decode("&#x".$chex.";"); - } - $isHex = false; - break; - case "(": - $plain = ""; - $isPlain = true; - break; - case ")": - $document .= $plain; - $isPlain = false; - break; - case "\\": - $c2 = $texts[$i][$j + 1]; - if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2; - elseif ($c2 == "n") $plain .= '\n'; - elseif ($c2 == "r") $plain .= '\r'; - elseif ($c2 == "t") $plain .= '\t'; - elseif ($c2 == "b") $plain .= '\b'; - elseif ($c2 == "f") $plain .= '\f'; - elseif ($c2 >= '0' && $c2 <= '9') { - $oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3)); - $j += strlen($oct) - 1; - $plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes); - } - $j++; - break; - - default: - if ($isHex) - $hex .= $c; - if ($isPlain) - $plain .= $c; - break; - } - } - $document .= "\n"; - } - - return $document; - } -} -?> \ No newline at end of file diff --git a/src/classes/tainacan-creator.php b/src/classes/tainacan-creator.php index 5c513b00f..964f38ee8 100644 --- a/src/classes/tainacan-creator.php +++ b/src/classes/tainacan-creator.php @@ -34,7 +34,6 @@ const DIRS = [ require_once('libs/wp-async-request.php'); require_once('libs/wp-background-process.php'); -require_once('libs/class-pdf2text.php'); require_once('class-tainacan-background-process.php'); require_once('tainacan-utils.php'); require_once(TAINACAN_IMPORTER_DIR . 'class-tainacan-bg-importer.php'); From 37eb2717cd9c1cae8d20b3a1e8412c0cbc266eb1 Mon Sep 17 00:00:00 2001 From: vnmedeiros Date: Thu, 19 Sep 2019 16:14:50 -0300 Subject: [PATCH 10/12] add composer.lock #245 --- composer.lock | 199 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 159 insertions(+), 40 deletions(-) diff --git a/composer.lock b/composer.lock index 7b39bdf8f..55daf5138 100644 --- a/composer.lock +++ b/composer.lock @@ -4,20 +4,20 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "b5a7a81dd7eed9bc122ee36e472c6430", + "content-hash": "e2a614836d1857e45c4f9be57073c2b2", "packages": [ { "name": "respect/validation", - "version": "1.1.16", + "version": "1.1.31", "source": { "type": "git", "url": "https://github.com/Respect/Validation.git", - "reference": "020ea1ebb5dc626bb7f1958ff49f69172ff589cc" + "reference": "45d109fc830644fecc1145200d6351ce4f2769d0" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/Respect/Validation/zipball/020ea1ebb5dc626bb7f1958ff49f69172ff589cc", - "reference": "020ea1ebb5dc626bb7f1958ff49f69172ff589cc", + "url": "https://api.github.com/repos/Respect/Validation/zipball/45d109fc830644fecc1145200d6351ce4f2769d0", + "reference": "45d109fc830644fecc1145200d6351ce4f2769d0", "shasum": "" }, "require": { @@ -25,9 +25,9 @@ "symfony/polyfill-mbstring": "^1.2" }, "require-dev": { - "egulias/email-validator": "~1.2", + "egulias/email-validator": "~1.2 || ~2.1", "mikey179/vfsstream": "^1.5", - "phpunit/phpunit": "~4.0", + "phpunit/phpunit": "~4.0 || ~5.0", "symfony/validator": "~2.6.9", "zendframework/zend-validator": "~2.3" }, @@ -35,7 +35,7 @@ "egulias/email-validator": "Strict (RFC compliant) email validation", "ext-bcmath": "Arbitrary Precision Mathematics", "ext-mbstring": "Multibyte String Functions", - "fabpot/php-cs-fixer": "Fix PSR2 and other coding style issues", + "friendsofphp/php-cs-fixer": "Fix PSR2 and other coding style issues", "symfony/validator": "Use Symfony validator through Respect\\Validation", "zendframework/zend-validator": "Use Zend Framework validator through Respect\\Validation" }, @@ -52,7 +52,7 @@ }, "notification-url": "https://packagist.org/downloads/", "license": [ - "BSD Style" + "BSD-3-Clause" ], "authors": [ { @@ -67,20 +67,70 @@ "validation", "validator" ], - "time": "2018-05-19T14:26:44+00:00" + "time": "2019-05-28T06:10:06+00:00" }, { - "name": "symfony/polyfill-mbstring", - "version": "v1.8.0", + "name": "smalot/pdfparser", + "version": "v0.14.0", "source": { "type": "git", - "url": "https://github.com/symfony/polyfill-mbstring.git", - "reference": "3296adf6a6454a050679cde90f95350ad604b171" + "url": "https://github.com/smalot/pdfparser.git", + "reference": "ec72a99028ba5e21a0acad92047b85e128cbf81f" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-mbstring/zipball/3296adf6a6454a050679cde90f95350ad604b171", - "reference": "3296adf6a6454a050679cde90f95350ad604b171", + "url": "https://api.github.com/repos/smalot/pdfparser/zipball/ec72a99028ba5e21a0acad92047b85e128cbf81f", + "reference": "ec72a99028ba5e21a0acad92047b85e128cbf81f", + "shasum": "" + }, + "require": { + "ext-mbstring": "*", + "ext-zlib": "*", + "php": ">=5.3.0", + "tecnickcom/tcpdf": "~6.0" + }, + "require-dev": { + "atoum/atoum": "^2.8 | ^3.0" + }, + "type": "library", + "autoload": { + "psr-0": { + "Smalot\\PdfParser\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "LGPL-3.0" + ], + "authors": [ + { + "name": "Sebastien Malot", + "email": "sebastien@malot.fr" + } + ], + "description": "Pdf parser library. Can read and extract information from pdf file.", + "homepage": "http://www.pdfparser.org", + "keywords": [ + "extract", + "parse", + "parser", + "pdf", + "text" + ], + "time": "2019-01-23T09:14:37+00:00" + }, + { + "name": "symfony/polyfill-mbstring", + "version": "v1.12.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/polyfill-mbstring.git", + "reference": "b42a2f66e8f1b15ccf25652c3424265923eb4f17" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-mbstring/zipball/b42a2f66e8f1b15ccf25652c3424265923eb4f17", + "reference": "b42a2f66e8f1b15ccf25652c3424265923eb4f17", "shasum": "" }, "require": { @@ -92,7 +142,7 @@ "type": "library", "extra": { "branch-alias": { - "dev-master": "1.8-dev" + "dev-master": "1.12-dev" } }, "autoload": { @@ -126,22 +176,84 @@ "portable", "shim" ], - "time": "2018-04-26T10:06:28+00:00" + "time": "2019-08-06T08:03:45+00:00" + }, + { + "name": "tecnickcom/tcpdf", + "version": "6.2.26", + "source": { + "type": "git", + "url": "https://github.com/tecnickcom/TCPDF.git", + "reference": "367241059ca166e3a76490f4448c284e0a161f15" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/tecnickcom/TCPDF/zipball/367241059ca166e3a76490f4448c284e0a161f15", + "reference": "367241059ca166e3a76490f4448c284e0a161f15", + "shasum": "" + }, + "require": { + "php": ">=5.3.0" + }, + "type": "library", + "autoload": { + "classmap": [ + "config", + "include", + "tcpdf.php", + "tcpdf_parser.php", + "tcpdf_import.php", + "tcpdf_barcodes_1d.php", + "tcpdf_barcodes_2d.php", + "include/tcpdf_colors.php", + "include/tcpdf_filters.php", + "include/tcpdf_font_data.php", + "include/tcpdf_fonts.php", + "include/tcpdf_images.php", + "include/tcpdf_static.php", + "include/barcodes/datamatrix.php", + "include/barcodes/pdf417.php", + "include/barcodes/qrcode.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "LGPL-3.0" + ], + "authors": [ + { + "name": "Nicola Asuni", + "email": "info@tecnick.com", + "role": "lead" + } + ], + "description": "TCPDF is a PHP class for generating PDF documents and barcodes.", + "homepage": "http://www.tcpdf.org/", + "keywords": [ + "PDFD32000-2008", + "TCPDF", + "barcodes", + "datamatrix", + "pdf", + "pdf417", + "qrcode" + ], + "time": "2018-10-16T17:24:05+00:00" } ], "packages-dev": [ { "name": "squizlabs/php_codesniffer", - "version": "3.2.3", + "version": "3.4.2", "source": { "type": "git", "url": "https://github.com/squizlabs/PHP_CodeSniffer.git", - "reference": "4842476c434e375f9d3182ff7b89059583aa8b27" + "reference": "b8a7362af1cc1aadb5bd36c3defc4dda2cf5f0a8" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/squizlabs/PHP_CodeSniffer/zipball/4842476c434e375f9d3182ff7b89059583aa8b27", - "reference": "4842476c434e375f9d3182ff7b89059583aa8b27", + "url": "https://api.github.com/repos/squizlabs/PHP_CodeSniffer/zipball/b8a7362af1cc1aadb5bd36c3defc4dda2cf5f0a8", + "reference": "b8a7362af1cc1aadb5bd36c3defc4dda2cf5f0a8", "shasum": "" }, "require": { @@ -174,64 +286,71 @@ } ], "description": "PHP_CodeSniffer tokenizes PHP, JavaScript and CSS files and detects violations of a defined set of coding standards.", - "homepage": "http://www.squizlabs.com/php-codesniffer", + "homepage": "https://github.com/squizlabs/PHP_CodeSniffer", "keywords": [ "phpcs", "standards" ], - "time": "2018-02-20T21:35:23+00:00" + "time": "2019-04-10T23:49:02+00:00" }, { "name": "wimg/php-compatibility", - "version": "8.1.0", + "version": "9.3.1", "source": { "type": "git", - "url": "https://github.com/wimg/PHPCompatibility.git", - "reference": "4ac01e4fe8faaa4f8d3b3cd06ea92e5418ce472e" + "url": "https://github.com/PHPCompatibility/PHPCompatibility.git", + "reference": "9999344e47e7af6b00e1a898eacc4e4368fb7196" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/wimg/PHPCompatibility/zipball/4ac01e4fe8faaa4f8d3b3cd06ea92e5418ce472e", - "reference": "4ac01e4fe8faaa4f8d3b3cd06ea92e5418ce472e", + "url": "https://api.github.com/repos/PHPCompatibility/PHPCompatibility/zipball/9999344e47e7af6b00e1a898eacc4e4368fb7196", + "reference": "9999344e47e7af6b00e1a898eacc4e4368fb7196", "shasum": "" }, "require": { "php": ">=5.3", - "squizlabs/php_codesniffer": "^2.2 || ^3.0.2" + "squizlabs/php_codesniffer": "^2.3 || ^3.0.2" }, "conflict": { "squizlabs/php_codesniffer": "2.6.2" }, "require-dev": { - "phpunit/phpunit": "^4.0 || ^5.0 || ^6.0" + "phpunit/phpunit": "~4.5 || ^5.0 || ^6.0 || ^7.0" }, "suggest": { - "dealerdirect/phpcodesniffer-composer-installer": "^0.4.3" + "dealerdirect/phpcodesniffer-composer-installer": "^0.5 || This Composer plugin will sort out the PHPCS 'installed_paths' automatically.", + "roave/security-advisories": "dev-master || Helps prevent installing dependencies with known security issues." }, "type": "phpcodesniffer-standard", - "autoload": { - "psr-4": { - "PHPCompatibility\\": "PHPCompatibility/" - } - }, "notification-url": "https://packagist.org/downloads/", "license": [ - "LGPL-3.0" + "LGPL-3.0-or-later" ], "authors": [ { "name": "Wim Godden", + "homepage": "https://github.com/wimg", "role": "lead" + }, + { + "name": "Juliette Reinders Folmer", + "homepage": "https://github.com/jrfnl", + "role": "lead" + }, + { + "name": "Contributors", + "homepage": "https://github.com/PHPCompatibility/PHPCompatibility/graphs/contributors" } ], - "description": "A set of sniffs for PHP_CodeSniffer that checks for PHP version compatibility.", + "description": "A set of sniffs for PHP_CodeSniffer that checks for PHP cross-version compatibility.", "homepage": "http://techblog.wimgodden.be/tag/codesniffer/", "keywords": [ "compatibility", "phpcs", "standards" ], - "time": "2017-12-27T21:58:38+00:00" + "abandoned": "phpcompatibility/php-compatibility", + "time": "2019-09-05T18:36:49+00:00" } ], "aliases": [], From c71e4b69b83cf246d8571b3f1719bcad2bf8cfc9 Mon Sep 17 00:00:00 2001 From: vnmedeiros Date: Thu, 19 Sep 2019 17:14:09 -0300 Subject: [PATCH 11/12] Checks if constant `TAINACAN_CONTENT_PDF_INDEX_ACTIVED` exists to active index pdf #245 --- src/classes/repositories/class-tainacan-items.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/classes/repositories/class-tainacan-items.php b/src/classes/repositories/class-tainacan-items.php index ba4291e45..5ee8ab8b9 100644 --- a/src/classes/repositories/class-tainacan-items.php +++ b/src/classes/repositories/class-tainacan-items.php @@ -476,7 +476,9 @@ class Items extends Repository { } } - $this->generate_index_content( $updated_item ); + if (defined('TAINACAN_CONTENT_PDF_INDEX_ACTIVED') && TAINACAN_CONTENT_PDF_INDEX_ACTIVED === true) { + $this->generate_index_content( $updated_item ); + } } /** From 4bdfb304e56c2ae80968da6b9c9aa98236cfb41c Mon Sep 17 00:00:00 2001 From: Leo Germani Date: Fri, 20 Sep 2019 09:10:00 -0300 Subject: [PATCH 12/12] Fix constant check to index pdf #245 --- src/classes/class-tainacan-media.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php index 5f769c00d..207f1cdb6 100644 --- a/src/classes/class-tainacan-media.php +++ b/src/classes/class-tainacan-media.php @@ -233,10 +233,11 @@ class Media { public function index_pdf_content($file, $item_id) { - $content_index_meta = '_document_content_index'; - if (defined('TAINACAN_CONTENT_PDF_INDEX_METADATA')) { - $content_index_meta = TAINACAN_CONTENT_PDF_INDEX_METADATA; + if ( ! defined('TAINACAN_INDEX_PDF_CONTENT') || true !== TAINACAN_INDEX_PDF_CONTENT ) { + return; } + + $content_index_meta = '_document_content_index'; if ($file == null) { $meta_id = update_post_meta( $item_id, $content_index_meta, null );