From 2e2b6f93f338481d68ac744f4250e5137049b58b Mon Sep 17 00:00:00 2001
From: leogermani <leogermani@gmail.com>
Date: Thu, 8 Aug 2019 11:51:51 -0300
Subject: [PATCH 01/12] starting implementation of index pdf contents using
 pdf2text class #245

---
 src/classes/class-tainacan-media.php |  26 ++
 src/classes/libs/class-pdf2text.php  | 377 +++++++++++++++++++++++++++
 2 files changed, 403 insertions(+)
 create mode 100644 src/classes/libs/class-pdf2text.php

diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php
index e8679da87..b8fba368b 100644
--- a/src/classes/class-tainacan-media.php
+++ b/src/classes/class-tainacan-media.php
@@ -227,5 +227,31 @@ class Media {
 		if( $this->THROW_EXCPTION_ON_FATAL_ERROR ) 
 			throw new \Exception("fatal error");
 	}
+	
+	public index_pdf_content($file, $item_id) {
+		
+		if ( ! \file_exists($file) ) {
+			return false;
+		}
+		
+		// Allow plugins to implement other approach to index pdf contents 
+		$alternate = apply_filters('tainacan-index-pdf', null, $file, $item_id);
+		if ( ! \is_null($alternate) ) {
+			return $alternate;
+		}
+		
+		require_once( TAINACAN_CLASSES_DIR . '/lib/class-pdf2text.php' );
+		
+		$PDF2Text = new PDF2Text();
+		$PDF2Text->setFilename($file);
+		
+		try {
+			$PDF2Text->decodePDF();
+			update_post_meta( $item_id, '_pdf_index', $PDF2Text->output() );
+		} catch($e) {
+			return false;
+		}
+		
+	}
 		
 }
\ No newline at end of file
diff --git a/src/classes/libs/class-pdf2text.php b/src/classes/libs/class-pdf2text.php
new file mode 100644
index 000000000..c46b51691
--- /dev/null
+++ b/src/classes/libs/class-pdf2text.php
@@ -0,0 +1,377 @@
+<?php
+/*
+
+Source: https://gist.github.com/getive/78b62d42cebd79b5dfd6
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+This code is an improved version of what can be found at:
+http://www.webcheatsheet.com/php/reading_clean_text_from_pdf.php
+
+AUTHOR:
+- Webcheatsheet.com (Original code)
+- Joeri Stegeman (joeri210 [at] yahoo [dot] com) (Class conversion and fixes/adjustments)
+
+DESCRIPTION:
+This is a class to convert PDF files into ASCII text or so called PDF text extraction.
+It will ignore anything that is not addressed as text within the PDF and any layout.
+Currently supported filters are: ASCIIHexDecode, ASCII85Decode, FlateDecode
+
+PURPOSE(S):
+Most likely for people that want their PDF to be searchable.
+
+SYNTAX:
+include('class.pdf2text.php');
+$a = new PDF2Text();
+$a->setFilename('test.pdf');
+$a->decodePDF();
+echo $a->output(); 
+
+ALTERNATIVES:
+Other excellent options to search within a PDF:
+- Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution
+- pdflib TET (http://www.pdflib.com/products/tet/)
+- Online converter: http://snowtide.com/PDFTextStream
+*/
+
+
+class PDF2Text {
+    // Some settings
+    var $multibyte = 2; // Use setUnicode(TRUE|FALSE)
+    var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None)
+    
+    // Variables
+    var $filename = '';
+    var $decodedtext = '';
+    
+    function setFilename($filename) {
+        // Point the decoder at a new file and drop text left over from a previous decodePDF() run.
+        $this->filename = $filename;
+        $this->decodedtext = '';
+    }
+
+    function output($echo = false) {
+        if (!$echo) return $this->decodedtext; // default: hand the decoded text back to the caller
+        echo $this->decodedtext; // $echo requested: print it instead (nothing is returned)
+    }
+
+    function setUnicode($input) {
+        // Hex strings are split into groups of $multibyte digits when decoded:
+        // 4 digits per character for unicode, 2 (the default) otherwise.
+        $this->multibyte = ($input == true) ? 4 : 2;
+    }
+
+    function decodePDF() { 
+        // Extract the text of $this->filename into $this->decodedtext (read it back with output()).
+        $infile = @file_get_contents($this->filename, FILE_BINARY); 
+        if (empty($infile)) 
+            return ""; // unreadable or empty file: decodedtext is left untouched
+    
+        // $texts collects raw text operands; $transformations collects character-code mappings.
+        $transformations = array(); 
+        $texts = array(); 
+    
+        // Get the list of all objects ("obj ... endobj" bodies).
+        preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $objects); 
+        $objects = @$objects[1]; 
+    
+        // Select objects with streams.
+        for ($i = 0; $i < count($objects); $i++) { 
+            $currentObject = $objects[$i]; 
+    
+            // Check if an object includes a data stream.
+            if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) { 
+                $stream = ltrim($stream[1]); 
+    
+                // Parse the object's dictionary to decide whether this stream may hold text.
+                $options = $this->getObjectOptions($currentObject); 
+                // Skip streams that declare Length1, Type or Subtype; only the rest are decoded.
+                if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) 
+                    continue; 
+    
+                // Hack: Length does not always seem to be correct, so let the decoder use the whole stream.
+                unset($options["Length"]);
+    
+                // Apply the filters named in the dictionary to get the stream's decoded data.
+                $data = $this->getDecodedStream($stream, $options);  
+    
+                if (strlen($data)) { 
+                    if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) {
+                        $textContainers = @$textContainers[1]; 
+                        $this->getDirtyTexts($texts, $textContainers); // stream with BT...ET text blocks
+                    } else 
+                        $this->getCharTransformations($transformations, $data); // otherwise scan for bfchar/bfrange maps
+                } 
+            } 
+        } 
+    
+        // Analyze text blocks taking into account character transformations and store the result.
+        $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations); 
+    }
+
+
+    function decodeAsciiHex($input) {
+        // Decode an ASCIIHexDecode stream: pairs of hex digits, whitespace ignored, ended by ">".
+        // Returns the decoded bytes, or "" when a non-hex character is met or ">" is missing.
+        $output = "";
+        $isOdd = true; // true while the next digit is the first (high) nibble of a pair
+        $isComment = false;
+        $length = strlen($input);
+        for($i = 0, $codeHigh = -1; $i < $length && $input[$i] != '>'; $i++) {
+            $c = $input[$i];
+            // NB: escapes below need double quotes; a single-quoted '\r' is two characters, never one byte.
+            if($isComment) {
+                if ($c == "\r" || $c == "\n")
+                    $isComment = false;
+                continue;
+            }
+    
+            switch($c) {
+                case "\0": case "\t": case "\r": case "\f": case "\n": case ' ': break;
+                case '%': 
+                    $isComment = true;
+                break;
+    
+                default:
+                    $code = hexdec($c);
+                    if($code === 0 && $c != '0')
+                        return "";
+    
+                    if($isOdd)
+                        $codeHigh = $code;
+                    else
+                        $output .= chr($codeHigh * 16 + $code);
+    
+                    $isOdd = !$isOdd;
+                break;
+            }
+        }
+        // Stopping at the end of the input means the ">" terminator was never found.
+        if($i >= $length)
+            return "";
+        // A pending high nibble (odd digit count) is completed with an implied trailing 0.
+        if(!$isOdd)
+            $output .= chr($codeHigh * 16);
+        return $output;
+    }
+    
+    function decodeAscii85($input) {
+        $output = "";
+        // Decode an ASCII85Decode stream (groups of 5 chars -> 4 bytes) up to "~"; "" on malformed input.
+        $isComment = false;
+        $ords = array();
+        // $state counts the characters collected so far for the current 5-character group.
+        for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) {
+            $c = $input[$i];
+            // NB: escapes below need double quotes; a single-quoted '\r' is two characters, never one byte.
+            if($isComment) {
+                if ($c === "\r" || $c === "\n")
+                    $isComment = false;
+                continue;
+            }
+    
+            if ($c === "\0" || $c === "\t" || $c === "\r" || $c === "\f" || $c === "\n" || $c === ' ')
+                continue;
+            if ($c == '%') {
+                $isComment = true;
+                continue;
+            }
+            if ($c == 'z' && $state === 0) {
+                $output .= str_repeat(chr(0), 4);
+                continue;
+            }
+            if ($c < '!' || $c > 'u')
+                return "";
+    
+            $code = ord($input[$i]) & 0xff;
+            $ords[$state++] = $code - ord('!');
+    
+            if ($state == 5) {
+                $state = 0;
+                for ($sum = 0, $j = 0; $j < 5; $j++)
+                    $sum = $sum * 85 + $ords[$j];
+                for ($j = 3; $j >= 0; $j--)
+                    $output .= chr($sum >> ($j * 8));
+            }
+        }
+        if ($state === 1)
+            return "";
+        elseif ($state > 1) {
+            for ($i = 0, $sum = 0; $i < $state; $i++)
+                $sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i);
+            for ($i = 0; $i < $state - 1; $i++)
+                $output .= chr($sum >> ((3 - $i) * 8)); // was "$ouput": the final partial group was silently dropped
+        }
+    
+        return $output;
+    }
+    
+    function decodeFlate($input) { // Inflate a FlateDecode stream by handing it to gzuncompress().
+        return gzuncompress($input); // NOTE(review): presumably false plus a warning on corrupt data, not "" like the other decoders - confirm
+    }
+    
+    function getObjectOptions($object) {
+        // Turn the first "<< ... >>" dictionary found in $object into an array keyed by entry name.
+        if (!preg_match("#<<(.*)>>#ismU", $object, $dictionary))
+            return array();
+
+        // Entries are "/"-separated; whatever precedes the first "/" is discarded.
+        $entries = explode("/", $dictionary[1]);
+        @array_shift($entries);
+    
+        $parsed = array();
+        foreach ($entries as $entry) {
+            $entry = preg_replace("#\s+#", " ", trim($entry));
+            if (strpos($entry, " ") === false) {
+                $parsed[$entry] = true; // bare name without a value
+                continue;
+            }
+            $pieces = explode(" ", $entry);
+            $parsed[$pieces[0]] = $pieces[1]; // "Name value ...": keep only the first value token
+        }
+    
+        return $parsed;
+    }
+    
+    function getDecodedStream($stream, $options) {
+        // Without a Filter entry the stream is returned exactly as it was found.
+        if (empty($options["Filter"]))
+            return $stream;
+
+        // Use Length when the caller kept it in $options, otherwise the whole stream.
+        $limit = empty($options["Length"]) ? strlen($stream) : $options["Length"];
+        $decoded = substr($stream, 0, $limit);
+    
+        // Filters are recognised by their name appearing as an option key and are
+        // applied in the order those keys occur. "Crypt" is not handled (TO DO).
+        foreach (array_keys($options) as $name) {
+            if ($name == "ASCIIHexDecode")
+                $decoded = $this->decodeAsciiHex($decoded);
+            if ($name == "ASCII85Decode")
+                $decoded = $this->decodeAscii85($decoded);
+            if ($name == "FlateDecode")
+                $decoded = $this->decodeFlate($decoded);
+        }
+    
+        return $decoded;
+    }
+    function getDirtyTexts(&$texts, $textContainers) {
+        // Operand patterns are tried in order; only the first one that matches a block is used.
+        $patterns = array("#\[(.*)\]\s*TJ[\n|\r]#ismU", "#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", "#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU");
+        for ($idx = 0, $total = count($textContainers); $idx < $total; $idx++) {
+            foreach ($patterns as $pattern) {
+                if (!preg_match_all($pattern, $textContainers[$idx], $found)) continue;
+                $texts = array_merge($texts, @$found[1]);
+                break;
+            }
+        }
+    }
+    function getCharTransformations(&$transformations, $stream) { // Adds the code mappings found in $stream's bfchar/bfrange sections to $transformations.
+        preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
+        preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);
+        // In each match, [1] is the number written before the begin keyword and [2] is the section body.
+        for ($j = 0; $j < count($chars); $j++) {
+            $count = $chars[$j][1];
+            $current = explode("\n", trim($chars[$j][2]));
+            for ($k = 0; $k < $count && $k < count($current); $k++) { // at most $count lines of the section are read
+                if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map))
+                    $transformations[str_pad($map[1], 4, "0")] = $map[2]; // key is right-padded with "0" to 4 hex digits
+            }
+        }
+        for ($j = 0; $j < count($ranges); $j++) {
+            $count = $ranges[$j][1];
+            $current = explode("\n", trim($ranges[$j][2]));
+            for ($k = 0; $k < $count && $k < count($current); $k++) {
+                if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) { // "<from> <to> <start>" line
+                    $from = hexdec($map[1]);
+                    $to = hexdec($map[2]);
+                    $_from = hexdec($map[3]);
+                    // Codes from..to map to consecutive values beginning at <start>.
+                    for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
+                        $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
+                } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) { // "<from> <to> [ ... ]" line
+                    $from = hexdec($map[1]);
+                    $to = hexdec($map[2]);
+                    $parts = preg_split("#\s+#", trim($map[3]));
+                    // Codes from..to map to the listed values in order, stopping when the list runs out.
+                    for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++)
+                        $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n]));
+                }
+            }
+        }
+    }
+    function getTextUsingTransformations($texts, $transformations) {
+        // Builds the final text: "<hex>" runs go through $transformations, "(plain)" runs are unescaped; one output line per entry of $texts.
+        $document = "";
+        for ($i = 0; $i < count($texts); $i++) {
+            $isHex = false;
+            $isPlain = false;
+            $hex = "";
+            $plain = "";
+            for ($j = 0; $j < strlen($texts[$i]); $j++) {
+                $c = $texts[$i][$j];
+                switch($c) {
+                    case "<":
+                        $hex = "";
+                        $isHex = true;
+                    break;
+                    case ">":
+                        $hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO)
+                        for ($k = 0; $k < count($hexs); $k++) {
+                            $chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero
+                            if (isset($transformations[$chex]))
+                                $chex = $transformations[$chex];
+                            $document .= html_entity_decode("&#x".$chex.";");
+                        }
+                        $isHex = false;
+                    break;
+                    case "(":
+                        $plain = "";
+                        $isPlain = true;
+                    break;
+                    case ")":
+                        $document .= $plain;
+                        $isPlain = false;
+                    break;
+                    case "\\":
+                        $c2 = isset($texts[$i][$j + 1]) ? $texts[$i][$j + 1] : ""; // a trailing backslash has no next character
+                        if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2;
+                        elseif ($c2 == "n") $plain .= "\n"; // double quotes: emit the control character, not backslash + letter
+                        elseif ($c2 == "r") $plain .= "\r";
+                        elseif ($c2 == "t") $plain .= "\t";
+                        elseif ($c2 == "b") $plain .= "\x08"; // backspace; PHP strings have no "\b" escape
+                        elseif ($c2 == "f") $plain .= "\f";
+                        elseif ($c2 >= '0' && $c2 <= '9') {
+                            $oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3));
+                            $j += strlen($oct) - 1;
+                            $plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes);
+                        }
+                        $j++;
+                    break;
+    
+                    default:
+                        if ($isHex)
+                            $hex .= $c;
+                        if ($isPlain)
+                            $plain .= $c;
+                    break;
+                }
+            }
+            $document .= "\n";
+        }
+    
+        return $document;
+    }
+}
+?>
\ No newline at end of file

From d5d50d7d605a3b486fb61d7897490091d199bb07 Mon Sep 17 00:00:00 2001
From: vnmedeiros <vnicius.nm.ba@gmail.com>
Date: Fri, 6 Sep 2019 16:24:55 -0300
Subject: [PATCH 02/12] add and remove content on metadata if is a pdf document
 on item #245

---
 src/classes/class-tainacan-media.php | 33 +++++++++++++++++++++-------
 src/classes/tainacan-creator.php     |  1 +
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php
index b8fba368b..584a93302 100644
--- a/src/classes/class-tainacan-media.php
+++ b/src/classes/class-tainacan-media.php
@@ -228,11 +228,25 @@ class Media {
 			throw new \Exception("fatal error");
 	}
 	
-	public index_pdf_content($file, $item_id) {
-		
+	public function index_pdf_content($file, $item_id) {
+
+		$content_index_meta = '_pdf_content_index';
+		if (defined('TAINACAN_CONTENT_PDF_INDEX_METADATA')) {
+			$content_index_meta = TAINACAN_CONTENT_PDF_INDEX_METADATA;
+		}
+
+		if ($file == null) {
+			$meta_id = update_post_meta( $item_id, $content_index_meta, null );
+			return true;
+		}
+
 		if ( ! \file_exists($file) ) {
 			return false;
 		}
+
+		if ( $this->get_mime_content_type($file) != 'application/pdf') {
+			return null;
+		}
 		
 		// Allow plugins to implement other approach to index pdf contents 
 		$alternate = apply_filters('tainacan-index-pdf', null, $file, $item_id);
@@ -240,18 +254,21 @@ class Media {
 			return $alternate;
 		}
 		
-		require_once( TAINACAN_CLASSES_DIR . '/lib/class-pdf2text.php' );
 		
-		$PDF2Text = new PDF2Text();
+
+		$PDF2Text = new \PDF2Text();
 		$PDF2Text->setFilename($file);
-		
 		try {
 			$PDF2Text->decodePDF();
-			update_post_meta( $item_id, '_pdf_index', $PDF2Text->output() );
-		} catch($e) {
+			$content = preg_replace('/[^a-zA-Z0-9_ -]/s','',$PDF2Text->output()); // melhorar essa expresão regular
+			//$content = filter_var ( $PDF2Text->output(), FILTER_SANITIZE_STRING);
+			//$content = iconv('ISO-8859-1', 'UTF-8//TRANSLIT//IGNORE', $PDF2Text->output());
+			//$content = preg_replace('/[\r\n\\n]+/', "\n", $content);
+			$meta_id = update_post_meta( $item_id, $content_index_meta, $content );
+		} catch(Exception $e) {
+			error_log('Caught exception: ' .  $e->getMessage() . "\n");
 			return false;
 		}
-		
 	}
 		
 }
\ No newline at end of file
diff --git a/src/classes/tainacan-creator.php b/src/classes/tainacan-creator.php
index 964f38ee8..5c513b00f 100644
--- a/src/classes/tainacan-creator.php
+++ b/src/classes/tainacan-creator.php
@@ -34,6 +34,7 @@ const DIRS = [
 
 require_once('libs/wp-async-request.php');
 require_once('libs/wp-background-process.php');
+require_once('libs/class-pdf2text.php');
 require_once('class-tainacan-background-process.php');
 require_once('tainacan-utils.php');
 require_once(TAINACAN_IMPORTER_DIR . 'class-tainacan-bg-importer.php');

From afc24ee50b9901dcf13ed71337a03ff7c954ed6b Mon Sep 17 00:00:00 2001
From: vnmedeiros <vnicius.nm.ba@gmail.com>
Date: Fri, 6 Sep 2019 16:25:11 -0300
Subject: [PATCH 03/12] add and remove content on metadata if is a pdf document
 on item # 245

---
 .../repositories/class-tainacan-items.php     | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/classes/repositories/class-tainacan-items.php b/src/classes/repositories/class-tainacan-items.php
index 30ed2c549..ba4291e45 100644
--- a/src/classes/repositories/class-tainacan-items.php
+++ b/src/classes/repositories/class-tainacan-items.php
@@ -376,6 +376,25 @@ class Items extends Repository {
 		return $where;
 	}
 	
+	/**
+	 * Hands the item's document to Media::index_pdf_content(): null when the item
+	 * has no document, or the attached file path for non-image attachments.
+	 *
+	 * @param  Entities\Item $item The item; other document types are left untouched.
+	 * @return boolean Always true; the result of index_pdf_content() is discarded.
+	 */
+	public function generate_index_content(Entities\Item $item) {
+		$TainacanMedia = \Tainacan\Media::get_instance();
+		if ( empty( $item->get_document() ) ) {
+			$TainacanMedia->index_pdf_content( null, $item->get_ID() ); // no document: pass null instead of a file path
+		} elseif ( $item->get_document_type() == 'attachment' ) {
+			if (! wp_attachment_is_image( $item->get_document() ) ) { // image attachments are never sent to the indexer
+				$filepath = get_attached_file( $item->get_document() );
+				$TainacanMedia->index_pdf_content( $filepath, $item->get_ID() );
+			}
+		}
+		return true;
+	}
 
 	/**
 	 * Get a default thumbnail ID from the item document.
@@ -452,13 +471,12 @@ class Items extends Repository {
 		) {
 
 			$thumb_id = $this->get_thumbnail_id_from_document( $updated_item );
-
 			if ( ! is_null( $thumb_id ) ) {
 				set_post_thumbnail( $updated_item->get_id(), (int) $thumb_id );
 			}
 
 		}
-
+		$this->generate_index_content( $updated_item );
 	}
 
 	/**

From f76c706896481a1c68c7b443a8b3b2d626a289a7 Mon Sep 17 00:00:00 2001
From: vnmedeiros <vnicius.nm.ba@gmail.com>
Date: Tue, 10 Sep 2019 13:14:28 -0300
Subject: [PATCH 04/12] add option to advanced search indexed content documents
 #245

---
 src/admin/components/advanced-search/advanced-search.vue | 3 ++-
 src/classes/class-tainacan-media.php                     | 9 ++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/admin/components/advanced-search/advanced-search.vue b/src/admin/components/advanced-search/advanced-search.vue
index ecd72a979..7526288af 100644
--- a/src/admin/components/advanced-search/advanced-search.vue
+++ b/src/admin/components/advanced-search/advanced-search.vue
@@ -54,6 +54,7 @@
                                     :value="`${metadatum.id}-${metadatum.metadata_type_options.taxonomy}-${metadatum.metadata_type_object.primitive_type}`"
                                     :key="metadatum.id"
                             >{{ metadatum.name }}</option>
+                            <option value="_document_content_index-undefined-string">Documento</option>
                         </b-select>
                     </b-field>
 
@@ -533,7 +534,7 @@
                         } else {
                             this.advancedSearchQuery.metaquery = Object.assign({}, this.advancedSearchQuery.metaquery, {
                                 [`${searchCriterion}`]: {
-                                    key: Number(criteriaKey[0]),
+                                    key: criteriaKey[0],
                                     compare: '=',
                                     originalMeta: value,
                                 }
diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php
index 584a93302..aac90529d 100644
--- a/src/classes/class-tainacan-media.php
+++ b/src/classes/class-tainacan-media.php
@@ -230,7 +230,7 @@ class Media {
 	
 	public function index_pdf_content($file, $item_id) {
 
-		$content_index_meta = '_pdf_content_index';
+    $content_index_meta = '_document_content_index';
 		if (defined('TAINACAN_CONTENT_PDF_INDEX_METADATA')) {
 			$content_index_meta = TAINACAN_CONTENT_PDF_INDEX_METADATA;
 		}
@@ -253,14 +253,13 @@ class Media {
 		if ( ! \is_null($alternate) ) {
 			return $alternate;
 		}
-		
-		
 
 		$PDF2Text = new \PDF2Text();
 		$PDF2Text->setFilename($file);
 		try {
-			$PDF2Text->decodePDF();
-			$content = preg_replace('/[^a-zA-Z0-9_ -]/s','',$PDF2Text->output()); // melhorar essa expresão regular
+      $PDF2Text->decodePDF();
+      //$content = $PDF2Text->output(); // melhorar essa expresão regular
+      $content = preg_replace('/[^a-zA-Z0-9_ -]/s','',$PDF2Text->output()); // melhorar essa expresão regular
 			//$content = filter_var ( $PDF2Text->output(), FILTER_SANITIZE_STRING);
 			//$content = iconv('ISO-8859-1', 'UTF-8//TRANSLIT//IGNORE', $PDF2Text->output());
 			//$content = preg_replace('/[\r\n\\n]+/', "\n", $content);

From 912dc51585ff134f4c94099ff45986a6591871f4 Mon Sep 17 00:00:00 2001
From: vnmedeiros <vnicius.nm.ba@gmail.com>
Date: Tue, 10 Sep 2019 16:05:07 -0300
Subject: [PATCH 05/12] fix encoding and regular expression #245

---
 src/classes/class-tainacan-media.php | 102 +++++++++++++--------------
 1 file changed, 50 insertions(+), 52 deletions(-)

diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php
index aac90529d..2412b4ede 100644
--- a/src/classes/class-tainacan-media.php
+++ b/src/classes/class-tainacan-media.php
@@ -10,13 +10,13 @@ class Media {
 	private static $file_handle = null;
 	private static $file_name = null;
 
-    public static function get_instance() {
-        if(!isset(self::$instance)) {
-            self::$instance = new self();
-        }
+		public static function get_instance() {
+				if(!isset(self::$instance)) {
+						self::$instance = new self();
+				}
 
-        return self::$instance;
-    }
+				return self::$instance;
+		}
 
 	/**
 	 * Insert an attachment from an URL address.
@@ -59,58 +59,58 @@ class Media {
 		
 	}
 
-    /**
-     * Avoid memory overflow problems with large files (Exceeded maximum memory limit of PHP)
-     *
-     * @param $url
-     * @return string the file path
-     */
-    public function save_remote_file($url) {
+		/**
+		 * Avoid memory overflow problems with large files (Exceeded maximum memory limit of PHP)
+		 *
+		 * @param $url
+		 * @return string the file path
+		 */
+		public function save_remote_file($url) {
 
-        set_time_limit(0);
+				set_time_limit(0);
 
-        $filename = tempnam(sys_get_temp_dir(), basename($url));
+				$filename = tempnam(sys_get_temp_dir(), basename($url));
 
-        # Open the file for writing...
-        self::$file_handle = fopen($filename, 'w+');
-        self::$file_name = $filename;
+				# Open the file for writing...
+				self::$file_handle = fopen($filename, 'w+');
+				self::$file_name = $filename;
 
-        $callback = function ($ch, $str)  {
-            $len = fwrite(self::$file_handle, $str);
-            return $len;
-        };
+				$callback = function ($ch, $str)  {
+						$len = fwrite(self::$file_handle, $str);
+						return $len;
+				};
 
-        $ch = curl_init();
-        curl_setopt($ch, CURLOPT_URL, $url);
-        curl_setopt($ch, CURLOPT_FILE, self::$file_handle);
-        curl_setopt($ch, CURLOPT_HEADER, 0);
-        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
-        curl_setopt($ch, CURLOPT_BINARYTRANSFER, true);
-        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); # optional
-        curl_setopt($ch, CURLOPT_TIMEOUT, -1); # optional: -1 = unlimited, 3600 = 1 hour
-        curl_setopt($ch, CURLOPT_VERBOSE, false); # Set to true to see all the innards
+				$ch = curl_init();
+				curl_setopt($ch, CURLOPT_URL, $url);
+				curl_setopt($ch, CURLOPT_FILE, self::$file_handle);
+				curl_setopt($ch, CURLOPT_HEADER, 0);
+				curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+				curl_setopt($ch, CURLOPT_BINARYTRANSFER, true);
+				curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); # optional
+				curl_setopt($ch, CURLOPT_TIMEOUT, -1); # optional: -1 = unlimited, 3600 = 1 hour
+				curl_setopt($ch, CURLOPT_VERBOSE, false); # Set to true to see all the innards
 
-        # Only if you need to bypass SSL certificate validation
-        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
-        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
+				# Only if you need to bypass SSL certificate validation
+				curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
+				curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
 
-        # Assign a callback function to the CURL Write-Function
-        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $callback);
+				# Assign a callback function to the CURL Write-Function
+				curl_setopt($ch, CURLOPT_WRITEFUNCTION, $callback);
 
-        # Exceute the download - note we DO NOT put the result into a variable!
-        curl_exec($ch);
+				# Exceute the download - note we DO NOT put the result into a variable!
+				curl_exec($ch);
 
-        # Close CURL
-        curl_close($ch);
+				# Close CURL
+				curl_close($ch);
 
-        # Close the file pointer
-        fclose(self::$file_handle);
+				# Close the file pointer
+				fclose(self::$file_handle);
 
-        return $filename;
-    }
+				return $filename;
+		}
 
 
-    /**
+		/**
 	 * Insert an attachment from an URL address.
 	 *
 	 * @param  blob $blob bitstream of the attachment
@@ -230,7 +230,7 @@ class Media {
 	
 	public function index_pdf_content($file, $item_id) {
 
-    $content_index_meta = '_document_content_index';
+		$content_index_meta = '_document_content_index';
 		if (defined('TAINACAN_CONTENT_PDF_INDEX_METADATA')) {
 			$content_index_meta = TAINACAN_CONTENT_PDF_INDEX_METADATA;
 		}
@@ -255,14 +255,12 @@ class Media {
 		}
 
 		$PDF2Text = new \PDF2Text();
+		$PDF2Text->setUnicode(true);
 		$PDF2Text->setFilename($file);
 		try {
-      $PDF2Text->decodePDF();
-      //$content = $PDF2Text->output(); // melhorar essa expresão regular
-      $content = preg_replace('/[^a-zA-Z0-9_ -]/s','',$PDF2Text->output()); // melhorar essa expresão regular
-			//$content = filter_var ( $PDF2Text->output(), FILTER_SANITIZE_STRING);
-			//$content = iconv('ISO-8859-1', 'UTF-8//TRANSLIT//IGNORE', $PDF2Text->output());
-			//$content = preg_replace('/[\r\n\\n]+/', "\n", $content);
+			$PDF2Text->decodePDF();
+			$content = preg_replace('~[[:cntrl:]]~', '', $PDF2Text->output());
+			$content = mb_convert_encoding($content, 'UTF-8', 'ISO-8859-1');
 			$meta_id = update_post_meta( $item_id, $content_index_meta, $content );
 		} catch(Exception $e) {
 			error_log('Caught exception: ' .  $e->getMessage() . "\n");

From 3cc9172d597af28cf9a513166ceb415198a29bd4 Mon Sep 17 00:00:00 2001
From: vnmedeiros <vnicius.nm.ba@gmail.com>
Date: Tue, 10 Sep 2019 16:27:31 -0300
Subject: [PATCH 06/12] remove the Number constructor in frontend #245

---
 src/admin/components/advanced-search/advanced-search.vue | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/admin/components/advanced-search/advanced-search.vue b/src/admin/components/advanced-search/advanced-search.vue
index 7526288af..76c22f74c 100644
--- a/src/admin/components/advanced-search/advanced-search.vue
+++ b/src/admin/components/advanced-search/advanced-search.vue
@@ -526,7 +526,7 @@
                         if(criteriaKey[2] != 'date' && criteriaKey[2] != 'int' && criteriaKey[2] != 'float'){
                             this.advancedSearchQuery.metaquery = Object.assign({}, this.advancedSearchQuery.metaquery, {
                                 [`${searchCriterion}`]: {
-                                    key: Number(criteriaKey[0]),
+                                    key: criteriaKey[0],
                                     compare: 'LIKE',
                                     originalMeta: value,
                                 }
@@ -534,7 +534,7 @@
                         } else {
                             this.advancedSearchQuery.metaquery = Object.assign({}, this.advancedSearchQuery.metaquery, {
                                 [`${searchCriterion}`]: {
-                                    key: criteriaKey[0],
+                                    key: Number(criteriaKey[0]),
                                     compare: '=',
                                     originalMeta: value,
                                 }

From 005a304edbbe220baaf166944ec777bf66168a27 Mon Sep 17 00:00:00 2001
From: vnmedeiros <vnicius.nm.ba@gmail.com>
Date: Wed, 11 Sep 2019 15:32:08 -0300
Subject: [PATCH 07/12] index document content using the wp-cli #245

---
 src/cli/class-tainacan-cli-document.php | 119 ++++++++++++++++++++++++
 src/cli/class-tainacan-cli.php          |   3 +-
 2 files changed, 121 insertions(+), 1 deletion(-)
 create mode 100644 src/cli/class-tainacan-cli-document.php

diff --git a/src/cli/class-tainacan-cli-document.php b/src/cli/class-tainacan-cli-document.php
new file mode 100644
index 000000000..0e930446e
--- /dev/null
+++ b/src/cli/class-tainacan-cli-document.php
@@ -0,0 +1,119 @@
+<?php 
+
+namespace Tainacan;
+
+use WP_CLI;
+use Tainacan\Repositories;
+
+class Cli_Document {
+
+	private $collection_repository;
+	private $items_repository;
+	private $result_count;
+	private $dry_run = false;
+	
+	public function __construct() {
+		$this->items_repository = Repositories\Items::get_instance();
+		$this->collection_repository = Repositories\Collections::get_instance();
+		$this->result_count = ['indexed_documents' => 0];
+	} 
+
+	/**
+	 * index content of documents
+	 *
+	 * ## OPTIONS
+	 * [--collection-id=<value>]
+	 * : <value> Specific ID of the collection into which the document content of the items will be indexed, if not informed all collections will be index.
+	 * 
+	 * 
+	 * [--dry-run]
+	 * : only count the items that would be indexed and output a report, without indexing anything
+	 * 
+	 * ## EXAMPLES
+	 * 
+	 * wp tainacan index-content --collection-id=416
+	 * indexing documents of items to collection 416:  100% [====================================================] 0:00 / 0:00
+	 * Success: 
+	 * 7 items indexed
+	 * 
+	 * 
+	 * wp tainacan index-content
+	 * indexing documents of items to collection 416:  100% [====================================================] 0:00 / 0:00
+	 * Success: 
+	 * 7 items indexed
+	 * indexing documents of items to collection 301:  100% [====================================================] 0:00 / 0:00
+	 * Success: 
+	 * 10 items indexed
+	 * 
+	 */
+	public function __invoke($args, $assoc_args) {
+		$this->dry_run = false;
+		if ( !empty($assoc_args['dry-run']) ) {
+			$this->dry_run = true;
+		}
+
+		if( empty($assoc_args['collection-id']) ) {
+			$this->index_item_all_collections();
+		} else {
+			$collection_id = $assoc_args['collection-id'];
+			$this->index_item($collection_id);
+		}
+	}
+
+	private function index_item_all_collections() {
+		// The counter restarts for each collection, so every report covers one collection only.
+		foreach ( $this->collection_repository->fetch( array('posts_per_page' => -1), 'OBJECT' ) as $current ) {
+			$this->result_count['indexed_documents'] = 0;
+			$this->index_item( $current->get_id() );
+		}
+	}
+
+	private function index_item($collection_id) {
+		// Walks the items of one collection, 50 per query, hands each to index_content_document_item() and reports the counter.
+		$per_page = 50;
+		$args = [
+			'posts_per_page'=> $per_page,
+			'paged' => 1,
+			'post_status' => get_post_stati()
+		];
+		$collection_items = $this->items_repository->fetch($args, $collection_id, 'WP_Query');
+		$total = $collection_items->found_posts;
+		$last_page = (int) ceil($total/$per_page);
+
+		$progress = \WP_CLI\Utils\make_progress_bar( "indexing documents of items to collection $collection_id:", $total );
+		for ($page = 1; $page <= $last_page; $page++) {
+			// Page 1 was loaded above to learn the total; later pages are fetched only
+			// when reached, so no query is issued past the last page.
+			if ($page > 1) {
+				$args['paged'] = $page;
+				$collection_items = $this->items_repository->fetch($args, $collection_id, 'WP_Query');
+			}
+			while ( $collection_items->have_posts() ) {
+				$collection_items->the_post();
+				$this->index_content_document_item( new Entities\Item($collection_items->post) );
+				$progress->tick();
+			}
+		}
+		$progress->finish();
+
+		\WP_CLI::success( "\n" . $this->result_count['indexed_documents'] . " items indexed" );
+	}
+
+	private function index_content_document_item($item) {
+		// Anything that is not an Item entity is reported through WP_CLI::error with the exit flag set.
+		if ( false === ($item instanceof Entities\Item) ) {
+			\WP_CLI::error( 'An item with this ID was not found', true );
+		}
+		// Items without a document are skipped and left out of the count.
+		$document = $item->get_document();
+		if ( empty($document) ) {
+			return null;
+		}
+		// Counted in both modes; a dry run returns before generate_index_content() is called.
+		$this->result_count['indexed_documents'] += 1;
+		return $this->dry_run ? true : $this->items_repository->generate_index_content($item);
+	}
+}
+
+
+ ?>
\ No newline at end of file
diff --git a/src/cli/class-tainacan-cli.php b/src/cli/class-tainacan-cli.php
index a391abda5..16409f6ce 100644
--- a/src/cli/class-tainacan-cli.php
+++ b/src/cli/class-tainacan-cli.php
@@ -28,7 +28,8 @@ class Cli {
 		
 		\WP_CLI::add_command('tainacan garbage-collector', 'Tainacan\Cli_Garbage_Collector');
 		\WP_CLI::add_command('tainacan move-attachments-to-items-folder', 'Tainacan\Cli_Move_Attachments');
-		\WP_CLI::add_command('tainacan collection', 'Tainacan\Cli_Collection');
+    \WP_CLI::add_command('tainacan collection', 'Tainacan\Cli_Collection');
+    \WP_CLI::add_command('tainacan index-content', 'Tainacan\Cli_Document');
 	}
 	
 	

From dbec21d502a8e06ab9041d2fe642a85381511e7e Mon Sep 17 00:00:00 2001
From: vnmedeiros <vnicius.nm.ba@gmail.com>
Date: Wed, 11 Sep 2019 23:53:47 -0300
Subject: [PATCH 08/12] improve encoding detection and the WP-CLI command #245

---
 src/classes/class-tainacan-media.php    |  6 +++++-
 src/cli/class-tainacan-cli-document.php | 18 +++++++++++-------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php
index 2412b4ede..730ca9e7f 100644
--- a/src/classes/class-tainacan-media.php
+++ b/src/classes/class-tainacan-media.php
@@ -260,7 +260,11 @@ class Media {
 		try {
 			$PDF2Text->decodePDF();
 			$content = preg_replace('~[[:cntrl:]]~', '', $PDF2Text->output());
-			$content = mb_convert_encoding($content, 'UTF-8', 'ISO-8859-1');
+
+			$wp_charset = get_bloginfo('charset');
+			$content_charset = mb_detect_encoding($content);
+			$content = mb_convert_encoding($content, $wp_charset, $content_charset);
+
 			$meta_id = update_post_meta( $item_id, $content_index_meta, $content );
 		} catch(Exception $e) {
 			error_log('Caught exception: ' .  $e->getMessage() . "\n");
diff --git a/src/cli/class-tainacan-cli-document.php b/src/cli/class-tainacan-cli-document.php
index 0e930446e..fe7256040 100644
--- a/src/cli/class-tainacan-cli-document.php
+++ b/src/cli/class-tainacan-cli-document.php
@@ -22,8 +22,8 @@ class Cli_Document {
 	 * index content of documents
 	 *
 	 * ## OPTIONS
-	 * [--collection-id=<value>]
-	 * : <value> Specific ID of the collection into which the document content of the items will be indexed, if not informed all collections will be index.
+	 * [--collection=<value>]
+	 * : <value> ID of the collection whose items will have their document content indexed, or 'all' to index every collection.
 	 * 
 	 * 
 	 * [--dry-run]
@@ -31,13 +31,13 @@ class Cli_Document {
 	 * 
 	 * ## EXAMPLES
 	 * 
-	 * wp tainacan index-content --collection-id=416
+	 * wp tainacan index-content --collection=416
 	 * indexing documents of items to collection 416:  100% [====================================================] 0:00 / 0:00
 	 * Success: 
 	 * 7 items indexed
 	 * 
 	 * 
-	 * wp tainacan index-content
+	 * wp tainacan index-content --collection=all
 	 * indexing documents of items to collection 416:  100% [====================================================] 0:00 / 0:00
 	 * Success: 
 	 * 7 items indexed
@@ -52,11 +52,15 @@ class Cli_Document {
 			$this->dry_run = true;
 		}
 
-		if( empty($assoc_args['collection-id']) ) {
+		if( empty($assoc_args['collection']) ) {
+			\WP_CLI::error( 'Wrong parameters', true );
+		}
+
+		$collection = $assoc_args['collection'];
+		if ($collection == 'all') {
 			$this->index_item_all_collections();
 		} else {
-			$collection_id = $assoc_args['collection-id'];
-			$this->index_item($collection_id);
+			$this->index_item($collection);
 		}
 	}
 

From fc7bb5250d160184d0068518ac5912c4deede47c Mon Sep 17 00:00:00 2001
From: vnmedeiros <vnicius.nm.ba@gmail.com>
Date: Wed, 18 Sep 2019 17:48:46 -0300
Subject: [PATCH 09/12] add the lib "PDF Parser" and change to use it

---
 composer.json                        |   3 +-
 src/classes/class-tainacan-media.php |  11 +-
 src/classes/libs/class-pdf2text.php  | 377 ---------------------------
 src/classes/tainacan-creator.php     |   1 -
 4 files changed, 6 insertions(+), 386 deletions(-)
 delete mode 100644 src/classes/libs/class-pdf2text.php

diff --git a/composer.json b/composer.json
index ae246dec5..d03574e01 100644
--- a/composer.json
+++ b/composer.json
@@ -3,7 +3,8 @@
     "description": "Transforme seu site wordpress em um repositório digital.",
     "type": "wordpress-plugin",
     "require": {
-        "respect/validation": "^1.1"
+        "respect/validation": "^1.1",
+        "smalot/pdfparser": "*"
     },
     "require-dev": {
         "squizlabs/php_codesniffer": "^2.2 || ^3.0.2",
diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php
index 730ca9e7f..0e8ad8c3d 100644
--- a/src/classes/class-tainacan-media.php
+++ b/src/classes/class-tainacan-media.php
@@ -229,7 +229,7 @@ class Media {
 	}
 	
 	public function index_pdf_content($file, $item_id) {
-
+		
 		$content_index_meta = '_document_content_index';
 		if (defined('TAINACAN_CONTENT_PDF_INDEX_METADATA')) {
 			$content_index_meta = TAINACAN_CONTENT_PDF_INDEX_METADATA;
@@ -239,7 +239,7 @@ class Media {
 			$meta_id = update_post_meta( $item_id, $content_index_meta, null );
 			return true;
 		}
-
+		
 		if ( ! \file_exists($file) ) {
 			return false;
 		}
@@ -254,12 +254,9 @@ class Media {
 			return $alternate;
 		}
 
-		$PDF2Text = new \PDF2Text();
-		$PDF2Text->setUnicode(true);
-		$PDF2Text->setFilename($file);
 		try {
-			$PDF2Text->decodePDF();
-			$content = preg_replace('~[[:cntrl:]]~', '', $PDF2Text->output());
+			$parser = new \Smalot\PdfParser\Parser();
+			$content    = $parser->parseFile($file)->getText();
 
 			$wp_charset = get_bloginfo('charset');
 			$content_charset = mb_detect_encoding($content);
diff --git a/src/classes/libs/class-pdf2text.php b/src/classes/libs/class-pdf2text.php
deleted file mode 100644
index c46b51691..000000000
--- a/src/classes/libs/class-pdf2text.php
+++ /dev/null
@@ -1,377 +0,0 @@
-<?php
-/*
-
-Source: https://gist.github.com/getive/78b62d42cebd79b5dfd6
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-This code is an improved version of what can be found at:
-http://www.webcheatsheet.com/php/reading_clean_text_from_pdf.php
-
-AUTHOR:
-- Webcheatsheet.com (Original code)
-- Joeri Stegeman (joeri210 [at] yahoo [dot] com) (Class conversion and fixes/adjustments)
-
-DESCRIPTION:
-This is a class to convert PDF files into ASCII text or so called PDF text extraction.
-It will ignore anything that is not addressed as text within the PDF and any layout.
-Currently supported filters are: ASCIIHexDecode, ASCII85Decode, FlateDecode
-
-PURPOSE(S):
-Most likely for people that want their PDF to be searchable.
-
-SYNTAX:
-include('class.pdf2text.php');
-$a = new PDF2Text();
-$a->setFilename('test.pdf');
-$a->decodePDF();
-echo $a->output(); 
-
-ALTERNATIVES:
-Other excellent options to search within a PDF:
-- Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution
-- pdflib TET (http://www.pdflib.com/products/tet/)
-- Online converter: http://snowtide.com/PDFTextStream
-*/
-
-
-class PDF2Text {
-    // Some settings
-    var $multibyte = 2; // Use setUnicode(TRUE|FALSE)
-    var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None)
-    
-    // Variables
-    var $filename = '';
-    var $decodedtext = '';
-    
-    function setFilename($filename) { 
-        // Reset
-        $this->decodedtext = '';
-        $this->filename = $filename;
-    }
-
-    function output($echo = false) { 
-        if($echo) echo $this->decodedtext;
-        else return $this->decodedtext;
-    }
-
-    function setUnicode($input) { 
-        // 4 for unicode. But 2 should work in most cases just fine
-        if($input == true) $this->multibyte = 4;
-        else $this->multibyte = 2;
-    }
-
-    function decodePDF() { 
-        // Read the data from pdf file
-        $infile = @file_get_contents($this->filename, FILE_BINARY); 
-        if (empty($infile)) 
-            return ""; 
-    
-        // Get all text data.
-        $transformations = array(); 
-        $texts = array(); 
-    
-        // Get the list of all objects.
-        preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $objects); 
-        $objects = @$objects[1]; 
-    
-        // Select objects with streams.
-        for ($i = 0; $i < count($objects); $i++) { 
-            $currentObject = $objects[$i]; 
-    
-            // Check if an object includes data stream.
-            if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) { 
-                $stream = ltrim($stream[1]); 
-    
-                // Check object parameters and look for text data. 
-                $options = $this->getObjectOptions($currentObject); 
-    
-                if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) 
-                    continue; 
-    
-                // Hack, length doesnt always seem to be correct
-                unset($options["Length"]);
-    
-                // So, we have text data. Decode it.
-                $data = $this->getDecodedStream($stream, $options);  
-    
-                if (strlen($data)) { 
-                    if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) {
-                        $textContainers = @$textContainers[1]; 
-                        $this->getDirtyTexts($texts, $textContainers); 
-                    } else 
-                        $this->getCharTransformations($transformations, $data); 
-                } 
-            } 
-        } 
-    
-        // Analyze text blocks taking into account character transformations and return results. 
-        $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations); 
-    }
-
-
-    function decodeAsciiHex($input) {
-        $output = "";
-    
-        $isOdd = true;
-        $isComment = false;
-    
-        for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) {
-            $c = $input[$i];
-    
-            if($isComment) {
-                if ($c == '\r' || $c == '\n')
-                    $isComment = false;
-                continue;
-            }
-    
-            switch($c) {
-                case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break;
-                case '%': 
-                    $isComment = true;
-                break;
-    
-                default:
-                    $code = hexdec($c);
-                    if($code === 0 && $c != '0')
-                        return "";
-    
-                    if($isOdd)
-                        $codeHigh = $code;
-                    else
-                        $output .= chr($codeHigh * 16 + $code);
-    
-                    $isOdd = !$isOdd;
-                break;
-            }
-        }
-    
-        if($input[$i] != '>')
-            return "";
-    
-        if($isOdd)
-            $output .= chr($codeHigh * 16);
-    
-        return $output;
-    }
-    
-    function decodeAscii85($input) {
-        $output = "";
-    
-        $isComment = false;
-        $ords = array();
-        
-        for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) {
-            $c = $input[$i];
-    
-            if($isComment) {
-                if ($c == '\r' || $c == '\n')
-                    $isComment = false;
-                continue;
-            }
-    
-            if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ')
-                continue;
-            if ($c == '%') {
-                $isComment = true;
-                continue;
-            }
-            if ($c == 'z' && $state === 0) {
-                $output .= str_repeat(chr(0), 4);
-                continue;
-            }
-            if ($c < '!' || $c > 'u')
-                return "";
-    
-            $code = ord($input[$i]) & 0xff;
-            $ords[$state++] = $code - ord('!');
-    
-            if ($state == 5) {
-                $state = 0;
-                for ($sum = 0, $j = 0; $j < 5; $j++)
-                    $sum = $sum * 85 + $ords[$j];
-                for ($j = 3; $j >= 0; $j--)
-                    $output .= chr($sum >> ($j * 8));
-            }
-        }
-        if ($state === 1)
-            return "";
-        elseif ($state > 1) {
-            for ($i = 0, $sum = 0; $i < $state; $i++)
-                $sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i);
-            for ($i = 0; $i < $state - 1; $i++)
-                $ouput .= chr($sum >> ((3 - $i) * 8));
-        }
-    
-        return $output;
-    }
-    
-    function decodeFlate($input) {
-        return gzuncompress($input);
-    }
-    
-    function getObjectOptions($object) {
-        $options = array();
-
-        if (preg_match("#<<(.*)>>#ismU", $object, $options)) {
-            $options = explode("/", $options[1]);
-            @array_shift($options);
-    
-            $o = array();
-            for ($j = 0; $j < @count($options); $j++) {
-                $options[$j] = preg_replace("#\s+#", " ", trim($options[$j]));
-                if (strpos($options[$j], " ") !== false) {
-                    $parts = explode(" ", $options[$j]);
-                    $o[$parts[0]] = $parts[1];
-                } else
-                    $o[$options[$j]] = true;
-            }
-            $options = $o;
-            unset($o);
-        }
-    
-        return $options;
-    }
-    
-    function getDecodedStream($stream, $options) {
-        $data = "";
-        if (empty($options["Filter"]))
-            $data = $stream;
-        else {
-            $length = !empty($options["Length"]) ? $options["Length"] : strlen($stream);
-            $_stream = substr($stream, 0, $length);
-    
-            foreach ($options as $key => $value) {
-                if ($key == "ASCIIHexDecode")
-                    $_stream = $this->decodeAsciiHex($_stream);
-                if ($key == "ASCII85Decode")
-                    $_stream = $this->decodeAscii85($_stream);
-                if ($key == "FlateDecode")
-                    $_stream = $this->decodeFlate($_stream);
-                if ($key == "Crypt") { // TO DO
-                }
-            }
-            $data = $_stream;
-        }
-        return $data;
-    }
-    function getDirtyTexts(&$texts, $textContainers) {
-        
-        for ($j = 0; $j < count($textContainers); $j++) {
-            if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts))
-                $texts = array_merge($texts, @$parts[1]);
-            elseif(preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
-                $texts = array_merge($texts, @$parts[1]);
-            elseif(preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
-                $texts = array_merge($texts, @$parts[1]);
-        }
-    }
-    function getCharTransformations(&$transformations, $stream) {
-        preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
-        preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);
-    
-        for ($j = 0; $j < count($chars); $j++) {
-            $count = $chars[$j][1];
-            $current = explode("\n", trim($chars[$j][2]));
-            for ($k = 0; $k < $count && $k < count($current); $k++) {
-                if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map))
-                    $transformations[str_pad($map[1], 4, "0")] = $map[2];
-            }
-        }
-        for ($j = 0; $j < count($ranges); $j++) {
-            $count = $ranges[$j][1];
-            $current = explode("\n", trim($ranges[$j][2]));
-            for ($k = 0; $k < $count && $k < count($current); $k++) {
-                if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) {
-                    $from = hexdec($map[1]);
-                    $to = hexdec($map[2]);
-                    $_from = hexdec($map[3]);
-    
-                    for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
-                        $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
-                } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) {
-                    $from = hexdec($map[1]);
-                    $to = hexdec($map[2]);
-                    $parts = preg_split("#\s+#", trim($map[3]));
-                    
-                    for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++)
-                        $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n]));
-                }
-            }
-        }
-    }
-    function getTextUsingTransformations($texts, $transformations) {
-        $document = "";
-        for ($i = 0; $i < count($texts); $i++) {
-            $isHex = false;
-            $isPlain = false;
-    
-            $hex = "";
-            $plain = "";
-            for ($j = 0; $j < strlen($texts[$i]); $j++) {
-                $c = $texts[$i][$j];
-                switch($c) {
-                    case "<":
-                        $hex = "";
-                        $isHex = true;
-                    break;
-                    case ">":
-                        $hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO)
-                        for ($k = 0; $k < count($hexs); $k++) {
-                            $chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero
-                            if (isset($transformations[$chex]))
-                                $chex = $transformations[$chex];
-                            $document .= html_entity_decode("&#x".$chex.";");
-                        }
-                        $isHex = false;
-                    break;
-                    case "(":
-                        $plain = "";
-                        $isPlain = true;
-                    break;
-                    case ")":
-                        $document .= $plain;
-                        $isPlain = false;
-                    break;
-                    case "\\":
-                        $c2 = $texts[$i][$j + 1];
-                        if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2;
-                        elseif ($c2 == "n") $plain .= '\n';
-                        elseif ($c2 == "r") $plain .= '\r';
-                        elseif ($c2 == "t") $plain .= '\t';
-                        elseif ($c2 == "b") $plain .= '\b';
-                        elseif ($c2 == "f") $plain .= '\f';
-                        elseif ($c2 >= '0' && $c2 <= '9') {
-                            $oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3));
-                            $j += strlen($oct) - 1;
-                            $plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes);
-                        }
-                        $j++;
-                    break;
-    
-                    default:
-                        if ($isHex)
-                            $hex .= $c;
-                        if ($isPlain)
-                            $plain .= $c;
-                    break;
-                }
-            }
-            $document .= "\n";
-        }
-    
-        return $document;
-    }
-}
-?>
\ No newline at end of file
diff --git a/src/classes/tainacan-creator.php b/src/classes/tainacan-creator.php
index 5c513b00f..964f38ee8 100644
--- a/src/classes/tainacan-creator.php
+++ b/src/classes/tainacan-creator.php
@@ -34,7 +34,6 @@ const DIRS = [
 
 require_once('libs/wp-async-request.php');
 require_once('libs/wp-background-process.php');
-require_once('libs/class-pdf2text.php');
 require_once('class-tainacan-background-process.php');
 require_once('tainacan-utils.php');
 require_once(TAINACAN_IMPORTER_DIR . 'class-tainacan-bg-importer.php');

From 37eb2717cd9c1cae8d20b3a1e8412c0cbc266eb1 Mon Sep 17 00:00:00 2001
From: vnmedeiros <vnicius.nm.ba@gmail.com>
Date: Thu, 19 Sep 2019 16:14:50 -0300
Subject: [PATCH 10/12] add composer.lock #245

---
 composer.lock | 199 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 159 insertions(+), 40 deletions(-)

diff --git a/composer.lock b/composer.lock
index 7b39bdf8f..55daf5138 100644
--- a/composer.lock
+++ b/composer.lock
@@ -4,20 +4,20 @@
         "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
         "This file is @generated automatically"
     ],
-    "content-hash": "b5a7a81dd7eed9bc122ee36e472c6430",
+    "content-hash": "e2a614836d1857e45c4f9be57073c2b2",
     "packages": [
         {
             "name": "respect/validation",
-            "version": "1.1.16",
+            "version": "1.1.31",
             "source": {
                 "type": "git",
                 "url": "https://github.com/Respect/Validation.git",
-                "reference": "020ea1ebb5dc626bb7f1958ff49f69172ff589cc"
+                "reference": "45d109fc830644fecc1145200d6351ce4f2769d0"
             },
             "dist": {
                 "type": "zip",
-                "url": "https://api.github.com/repos/Respect/Validation/zipball/020ea1ebb5dc626bb7f1958ff49f69172ff589cc",
-                "reference": "020ea1ebb5dc626bb7f1958ff49f69172ff589cc",
+                "url": "https://api.github.com/repos/Respect/Validation/zipball/45d109fc830644fecc1145200d6351ce4f2769d0",
+                "reference": "45d109fc830644fecc1145200d6351ce4f2769d0",
                 "shasum": ""
             },
             "require": {
@@ -25,9 +25,9 @@
                 "symfony/polyfill-mbstring": "^1.2"
             },
             "require-dev": {
-                "egulias/email-validator": "~1.2",
+                "egulias/email-validator": "~1.2 || ~2.1",
                 "mikey179/vfsstream": "^1.5",
-                "phpunit/phpunit": "~4.0",
+                "phpunit/phpunit": "~4.0 || ~5.0",
                 "symfony/validator": "~2.6.9",
                 "zendframework/zend-validator": "~2.3"
             },
@@ -35,7 +35,7 @@
                 "egulias/email-validator": "Strict (RFC compliant) email validation",
                 "ext-bcmath": "Arbitrary Precision Mathematics",
                 "ext-mbstring": "Multibyte String Functions",
-                "fabpot/php-cs-fixer": "Fix PSR2 and other coding style issues",
+                "friendsofphp/php-cs-fixer": "Fix PSR2 and other coding style issues",
                 "symfony/validator": "Use Symfony validator through Respect\\Validation",
                 "zendframework/zend-validator": "Use Zend Framework validator through Respect\\Validation"
             },
@@ -52,7 +52,7 @@
             },
             "notification-url": "https://packagist.org/downloads/",
             "license": [
-                "BSD Style"
+                "BSD-3-Clause"
             ],
             "authors": [
                 {
@@ -67,20 +67,70 @@
                 "validation",
                 "validator"
             ],
-            "time": "2018-05-19T14:26:44+00:00"
+            "time": "2019-05-28T06:10:06+00:00"
         },
         {
-            "name": "symfony/polyfill-mbstring",
-            "version": "v1.8.0",
+            "name": "smalot/pdfparser",
+            "version": "v0.14.0",
             "source": {
                 "type": "git",
-                "url": "https://github.com/symfony/polyfill-mbstring.git",
-                "reference": "3296adf6a6454a050679cde90f95350ad604b171"
+                "url": "https://github.com/smalot/pdfparser.git",
+                "reference": "ec72a99028ba5e21a0acad92047b85e128cbf81f"
             },
             "dist": {
                 "type": "zip",
-                "url": "https://api.github.com/repos/symfony/polyfill-mbstring/zipball/3296adf6a6454a050679cde90f95350ad604b171",
-                "reference": "3296adf6a6454a050679cde90f95350ad604b171",
+                "url": "https://api.github.com/repos/smalot/pdfparser/zipball/ec72a99028ba5e21a0acad92047b85e128cbf81f",
+                "reference": "ec72a99028ba5e21a0acad92047b85e128cbf81f",
+                "shasum": ""
+            },
+            "require": {
+                "ext-mbstring": "*",
+                "ext-zlib": "*",
+                "php": ">=5.3.0",
+                "tecnickcom/tcpdf": "~6.0"
+            },
+            "require-dev": {
+                "atoum/atoum": "^2.8 | ^3.0"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-0": {
+                    "Smalot\\PdfParser\\": "src/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "LGPL-3.0"
+            ],
+            "authors": [
+                {
+                    "name": "Sebastien Malot",
+                    "email": "sebastien@malot.fr"
+                }
+            ],
+            "description": "Pdf parser library. Can read and extract information from pdf file.",
+            "homepage": "http://www.pdfparser.org",
+            "keywords": [
+                "extract",
+                "parse",
+                "parser",
+                "pdf",
+                "text"
+            ],
+            "time": "2019-01-23T09:14:37+00:00"
+        },
+        {
+            "name": "symfony/polyfill-mbstring",
+            "version": "v1.12.0",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/symfony/polyfill-mbstring.git",
+                "reference": "b42a2f66e8f1b15ccf25652c3424265923eb4f17"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/symfony/polyfill-mbstring/zipball/b42a2f66e8f1b15ccf25652c3424265923eb4f17",
+                "reference": "b42a2f66e8f1b15ccf25652c3424265923eb4f17",
                 "shasum": ""
             },
             "require": {
@@ -92,7 +142,7 @@
             "type": "library",
             "extra": {
                 "branch-alias": {
-                    "dev-master": "1.8-dev"
+                    "dev-master": "1.12-dev"
                 }
             },
             "autoload": {
@@ -126,22 +176,84 @@
                 "portable",
                 "shim"
             ],
-            "time": "2018-04-26T10:06:28+00:00"
+            "time": "2019-08-06T08:03:45+00:00"
+        },
+        {
+            "name": "tecnickcom/tcpdf",
+            "version": "6.2.26",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/tecnickcom/TCPDF.git",
+                "reference": "367241059ca166e3a76490f4448c284e0a161f15"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/tecnickcom/TCPDF/zipball/367241059ca166e3a76490f4448c284e0a161f15",
+                "reference": "367241059ca166e3a76490f4448c284e0a161f15",
+                "shasum": ""
+            },
+            "require": {
+                "php": ">=5.3.0"
+            },
+            "type": "library",
+            "autoload": {
+                "classmap": [
+                    "config",
+                    "include",
+                    "tcpdf.php",
+                    "tcpdf_parser.php",
+                    "tcpdf_import.php",
+                    "tcpdf_barcodes_1d.php",
+                    "tcpdf_barcodes_2d.php",
+                    "include/tcpdf_colors.php",
+                    "include/tcpdf_filters.php",
+                    "include/tcpdf_font_data.php",
+                    "include/tcpdf_fonts.php",
+                    "include/tcpdf_images.php",
+                    "include/tcpdf_static.php",
+                    "include/barcodes/datamatrix.php",
+                    "include/barcodes/pdf417.php",
+                    "include/barcodes/qrcode.php"
+                ]
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "LGPL-3.0"
+            ],
+            "authors": [
+                {
+                    "name": "Nicola Asuni",
+                    "email": "info@tecnick.com",
+                    "role": "lead"
+                }
+            ],
+            "description": "TCPDF is a PHP class for generating PDF documents and barcodes.",
+            "homepage": "http://www.tcpdf.org/",
+            "keywords": [
+                "PDFD32000-2008",
+                "TCPDF",
+                "barcodes",
+                "datamatrix",
+                "pdf",
+                "pdf417",
+                "qrcode"
+            ],
+            "time": "2018-10-16T17:24:05+00:00"
         }
     ],
     "packages-dev": [
         {
             "name": "squizlabs/php_codesniffer",
-            "version": "3.2.3",
+            "version": "3.4.2",
             "source": {
                 "type": "git",
                 "url": "https://github.com/squizlabs/PHP_CodeSniffer.git",
-                "reference": "4842476c434e375f9d3182ff7b89059583aa8b27"
+                "reference": "b8a7362af1cc1aadb5bd36c3defc4dda2cf5f0a8"
             },
             "dist": {
                 "type": "zip",
-                "url": "https://api.github.com/repos/squizlabs/PHP_CodeSniffer/zipball/4842476c434e375f9d3182ff7b89059583aa8b27",
-                "reference": "4842476c434e375f9d3182ff7b89059583aa8b27",
+                "url": "https://api.github.com/repos/squizlabs/PHP_CodeSniffer/zipball/b8a7362af1cc1aadb5bd36c3defc4dda2cf5f0a8",
+                "reference": "b8a7362af1cc1aadb5bd36c3defc4dda2cf5f0a8",
                 "shasum": ""
             },
             "require": {
@@ -174,64 +286,71 @@
                 }
             ],
             "description": "PHP_CodeSniffer tokenizes PHP, JavaScript and CSS files and detects violations of a defined set of coding standards.",
-            "homepage": "http://www.squizlabs.com/php-codesniffer",
+            "homepage": "https://github.com/squizlabs/PHP_CodeSniffer",
             "keywords": [
                 "phpcs",
                 "standards"
             ],
-            "time": "2018-02-20T21:35:23+00:00"
+            "time": "2019-04-10T23:49:02+00:00"
         },
         {
             "name": "wimg/php-compatibility",
-            "version": "8.1.0",
+            "version": "9.3.1",
             "source": {
                 "type": "git",
-                "url": "https://github.com/wimg/PHPCompatibility.git",
-                "reference": "4ac01e4fe8faaa4f8d3b3cd06ea92e5418ce472e"
+                "url": "https://github.com/PHPCompatibility/PHPCompatibility.git",
+                "reference": "9999344e47e7af6b00e1a898eacc4e4368fb7196"
             },
             "dist": {
                 "type": "zip",
-                "url": "https://api.github.com/repos/wimg/PHPCompatibility/zipball/4ac01e4fe8faaa4f8d3b3cd06ea92e5418ce472e",
-                "reference": "4ac01e4fe8faaa4f8d3b3cd06ea92e5418ce472e",
+                "url": "https://api.github.com/repos/PHPCompatibility/PHPCompatibility/zipball/9999344e47e7af6b00e1a898eacc4e4368fb7196",
+                "reference": "9999344e47e7af6b00e1a898eacc4e4368fb7196",
                 "shasum": ""
             },
             "require": {
                 "php": ">=5.3",
-                "squizlabs/php_codesniffer": "^2.2 || ^3.0.2"
+                "squizlabs/php_codesniffer": "^2.3 || ^3.0.2"
             },
             "conflict": {
                 "squizlabs/php_codesniffer": "2.6.2"
             },
             "require-dev": {
-                "phpunit/phpunit": "^4.0 || ^5.0 || ^6.0"
+                "phpunit/phpunit": "~4.5 || ^5.0 || ^6.0 || ^7.0"
             },
             "suggest": {
-                "dealerdirect/phpcodesniffer-composer-installer": "^0.4.3"
+                "dealerdirect/phpcodesniffer-composer-installer": "^0.5 || This Composer plugin will sort out the PHPCS 'installed_paths' automatically.",
+                "roave/security-advisories": "dev-master || Helps prevent installing dependencies with known security issues."
             },
             "type": "phpcodesniffer-standard",
-            "autoload": {
-                "psr-4": {
-                    "PHPCompatibility\\": "PHPCompatibility/"
-                }
-            },
             "notification-url": "https://packagist.org/downloads/",
             "license": [
-                "LGPL-3.0"
+                "LGPL-3.0-or-later"
             ],
             "authors": [
                 {
                     "name": "Wim Godden",
+                    "homepage": "https://github.com/wimg",
                     "role": "lead"
+                },
+                {
+                    "name": "Juliette Reinders Folmer",
+                    "homepage": "https://github.com/jrfnl",
+                    "role": "lead"
+                },
+                {
+                    "name": "Contributors",
+                    "homepage": "https://github.com/PHPCompatibility/PHPCompatibility/graphs/contributors"
                 }
             ],
-            "description": "A set of sniffs for PHP_CodeSniffer that checks for PHP version compatibility.",
+            "description": "A set of sniffs for PHP_CodeSniffer that checks for PHP cross-version compatibility.",
             "homepage": "http://techblog.wimgodden.be/tag/codesniffer/",
             "keywords": [
                 "compatibility",
                 "phpcs",
                 "standards"
             ],
-            "time": "2017-12-27T21:58:38+00:00"
+            "abandoned": "phpcompatibility/php-compatibility",
+            "time": "2019-09-05T18:36:49+00:00"
         }
     ],
     "aliases": [],

From c71e4b69b83cf246d8571b3f1719bcad2bf8cfc9 Mon Sep 17 00:00:00 2001
From: vnmedeiros <vnicius.nm.ba@gmail.com>
Date: Thu, 19 Sep 2019 17:14:09 -0300
Subject: [PATCH 11/12] Checks if constant `TAINACAN_CONTENT_PDF_INDEX_ACTIVED`
 exists to activate PDF indexing #245

---
 src/classes/repositories/class-tainacan-items.php | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/classes/repositories/class-tainacan-items.php b/src/classes/repositories/class-tainacan-items.php
index ba4291e45..5ee8ab8b9 100644
--- a/src/classes/repositories/class-tainacan-items.php
+++ b/src/classes/repositories/class-tainacan-items.php
@@ -476,7 +476,9 @@ class Items extends Repository {
 			}
 
 		}
-		$this->generate_index_content( $updated_item );
+		if (defined('TAINACAN_CONTENT_PDF_INDEX_ACTIVED') && TAINACAN_CONTENT_PDF_INDEX_ACTIVED === true) {
+			$this->generate_index_content( $updated_item );
+		}
 	}
 
 	/**

From 4bdfb304e56c2ae80968da6b9c9aa98236cfb41c Mon Sep 17 00:00:00 2001
From: Leo Germani <leogermani@hacklab.com.br>
Date: Fri, 20 Sep 2019 09:10:00 -0300
Subject: [PATCH 12/12] Fix constant check to index pdf #245

---
 src/classes/class-tainacan-media.php | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/classes/class-tainacan-media.php b/src/classes/class-tainacan-media.php
index 5f769c00d..207f1cdb6 100644
--- a/src/classes/class-tainacan-media.php
+++ b/src/classes/class-tainacan-media.php
@@ -233,10 +233,11 @@ class Media {
 	
 	public function index_pdf_content($file, $item_id) {
 		
-		$content_index_meta = '_document_content_index';
-		if (defined('TAINACAN_CONTENT_PDF_INDEX_METADATA')) {
-			$content_index_meta = TAINACAN_CONTENT_PDF_INDEX_METADATA;
+		if ( ! defined('TAINACAN_INDEX_PDF_CONTENT') || true !== TAINACAN_INDEX_PDF_CONTENT ) {
+			return;
 		}
+		
+		$content_index_meta = '_document_content_index';
 
 		if ($file == null) {
 			$meta_id = update_post_meta( $item_id, $content_index_meta, null );