Script que faz leitura de um PDF para inserir o conteúdo em banco de dados.

Jakson Fischer · 25 de maio de 2017

Fala galera,

Seguinte...

Estou com um senhor problema aqui... Estou usando um script PHP para ler o conteúdo do PDF, até aí, funciona que é uma beleza, o problema é que o bendito não consegue ler os acentos e caracteres especiais... Tentei inúmeras coisas para solucionar, até que resolvi apelar para os universitários hahahaha

O código que estou usando está abaixo:

<?
class PDF2Text {
	// Some settings
	var $multibyte = 2; // Use setUnicode(TRUE|FALSE)
	var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None)
	
	// Variables
	var $filename = '';
	var $decodedtext = '';
	
	function setFilename($filename) { 
		// Reset
		$this->decodedtext = '';
		$this->filename = $filename;
	}

	function output($echo = false) { 
		if($echo) echo $this->decodedtext;
		else return $this->decodedtext;
	}

	function setUnicode($input) { 
		// 4 for unicode. But 2 should work in most cases just fine
		if($input == true) $this->multibyte = 4;
		else $this->multibyte = 2;
	}

	function decodePDF() { 
		// Read the data from pdf file
		$infile = @file_get_contents($this->filename, FILE_BINARY); 
		if (empty($infile)) 
			return ""; 
	
		// Get all text data.
		$transformations = array(); 
		$texts = array(); 
	
		// Get the list of all objects.
		preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $objects); 
		$objects = @$objects[1]; 
	
		// Select objects with streams.
		for ($i = 0; $i < count($objects); $i++) { 
			$currentObject = $objects[$i]; 
	
			// Check if an object includes data stream.
			if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) { 
				$stream = ltrim($stream[1]); 
	
				// Check object parameters and look for text data. 
				$options = $this->getObjectOptions($currentObject); 
	
				if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) 
					continue; 
	
				// Hack, length doesnt always seem to be correct
				unset($options["Length"]);
	
				// So, we have text data. Decode it.
				$data = $this->getDecodedStream($stream, $options);  
	
				if (strlen($data)) { 
	                if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) {
						$textContainers = @$textContainers[1]; 
						$this->getDirtyTexts($texts, $textContainers); 
					} else 
						$this->getCharTransformations($transformations, $data); 
				} 
			} 
		} 
	
		// Analyze text blocks taking into account character transformations and return results. 
		$this->decodedtext = $this->getTextUsingTransformations($texts, $transformations); 
	}


	function decodeAsciiHex($input) {
		$output = "";
	
		$isOdd = true;
		$isComment = false;

	
		for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) {
			$c = $input[$i];
	
			if($isComment) {
				if ($c == '\r' || $c == '\n')
					$isComment = false;
				continue;
			}
	
			switch($c) {
				case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break;
				case '%': 
					$isComment = true;
				break;
	
				default:
					$code = hexdec($c);
					if($code === 0 && $c != '0')
						return "";
	
					if($isOdd)
						$codeHigh = $code;
					else
						$output .= chr($codeHigh * 16 + $code);
	
					$isOdd = !$isOdd;
				break;
			}
		}
	
		if($input[$i] != '>')
			return "";
	
		if($isOdd)
			$output .= chr($codeHigh * 16);
	
		return $output;
	}
	
	function decodeAscii85($input) {
		$output = "";
	
		$isComment = false;
		$ords = array();
		
		for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) {
			$c = $input[$i];
	
			if($isComment) {
				if ($c == '\r' || $c == '\n')
					$isComment = false;
				continue;
			}
	
			if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ')
				continue;
			if ($c == '%') {
				$isComment = true;
				continue;
			}
			if ($c == 'z' && $state === 0) {
				$output .= str_repeat(chr(0), 4);
				continue;
			}
			if ($c < '!' || $c > 'u')
				return "";
	
			$code = ord($input[$i]) & 0xff;
			$ords[$state++] = $code - ord('!');
	
			if ($state == 5) {
				$state = 0;
				for ($sum = 0, $j = 0; $j < 5; $j++)
					$sum = $sum * 85 + $ords[$j];
				for ($j = 3; $j >= 0; $j--)
					$output .= chr($sum >> ($j * 8));
			}
		}
		if ($state === 1)
			return "";
		elseif ($state > 1) {
			for ($i = 0, $sum = 0; $i < $state; $i++)
				$sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i);
			for ($i = 0; $i < $state - 1; $i++)
				$ouput .= chr($sum >> ((3 - $i) * 8));
		}
	
		return $output;
	}
	
	function decodeFlate($input) {
		return gzuncompress($input);
	}
	
	function getObjectOptions($object) {
		$options = array();

		if (preg_match("#<<(.*)>>#ismU", $object, $options)) {
			$options = explode("/", $options[1]);
			@array_shift($options);
	
			$o = array();
			for ($j = 0; $j < @count($options); $j++) {
				$options[$j] = preg_replace("#\s+#", " ", trim($options[$j]));
				if (strpos($options[$j], " ") !== false) {
					$parts = explode(" ", $options[$j]);
					$o[$parts[0]] = $parts[1];
				} else
					$o[$options[$j]] = true;
			}
			$options = $o;
			unset($o);
		}
	
		return $options;
	}
	
	function getDecodedStream($stream, $options) {
		$data = "";
		if (empty($options["Filter"]))
			$data = $stream;
		else {
			$length = !empty($options["Length"]) ? $options["Length"] : strlen($stream);
			$_stream = substr($stream, 0, $length);
	
			foreach ($options as $key => $value) {
				if ($key == "ASCIIHexDecode")
					$_stream = $this->decodeAsciiHex($_stream);
				if ($key == "ASCII85Decode")
					$_stream = $this->decodeAscii85($_stream);
				if ($key == "FlateDecode")
					$_stream = $this->decodeFlate($_stream);
				if ($key == "Crypt") { // TO DO
				}
			}
			$data = $_stream;
		}
		return $data;
	}
	function getDirtyTexts(&$texts, $textContainers) {
		
		for ($j = 0; $j < count($textContainers); $j++) {
			if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts))
				$texts = array_merge($texts, @$parts[1]);
			elseif(preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
				$texts = array_merge($texts, @$parts[1]);
			elseif(preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
				$texts = array_merge($texts, @$parts[1]);
		}
	}
	function getCharTransformations(&$transformations, $stream) {
		preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
		preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);
	
		for ($j = 0; $j < count($chars); $j++) {
			$count = $chars[$j][1];
			$current = explode("\n", trim($chars[$j][2]));
			for ($k = 0; $k < $count && $k < count($current); $k++) {
				if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map))
					$transformations[str_pad($map[1], 4, "0")] = $map[2];
			}
		}
		for ($j = 0; $j < count($ranges); $j++) {
			$count = $ranges[$j][1];
			$current = explode("\n", trim($ranges[$j][2]));
			for ($k = 0; $k < $count && $k < count($current); $k++) {
				if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) {
					$from = hexdec($map[1]);
					$to = hexdec($map[2]);
					$_from = hexdec($map[3]);
	
					for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
						$transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
				} elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) {
					$from = hexdec($map[1]);
					$to = hexdec($map[2]);
					$parts = preg_split("#\s+#", trim($map[3]));
					
					for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++)
						$transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n]));
				}
			}
		}
	}
	function getTextUsingTransformations($texts, $transformations) {
		$document = "";
		for ($i = 0; $i < count($texts); $i++) {
			$isHex = false;
			$isPlain = false;
	
			$hex = "";
			$plain = "";
			for ($j = 0; $j < strlen($texts[$i]); $j++) {
				$c = $texts[$i][$j];
				switch($c) {
					case "<":
						$hex = "";
						$isHex = true;
					break;
					case ">":
						$hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO)
						for ($k = 0; $k < count($hexs); $k++) {
							$chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero
							if (isset($transformations[$chex]))
								$chex = $transformations[$chex];
							$document .= html_entity_decode("&#x".$chex.";");
						}
						$isHex = false;
					break;
					case "(":
						$plain = "";
						$isPlain = true;
					break;
					case ")":
						$document .= $plain;
						$isPlain = false;
					break;
					case "\\":
						$c2 = $texts[$i][$j + 1];
						if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2;
						elseif ($c2 == "n") $plain .= '\n';
						elseif ($c2 == "r") $plain .= '\r';
						elseif ($c2 == "t") $plain .= '\t';
						elseif ($c2 == "b") $plain .= '\b';
						elseif ($c2 == "f") $plain .= '\f';
						elseif ($c2 >= '0' && $c2 <= '9') {
							$oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3));
							$j += strlen($oct) - 1;
							$plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes);
						}
						$j++;
					break;
	
					default:
						if ($isHex)
							$hex .= $c;
						if ($isPlain)
							$plain .= $c;
					break;
				}
			}
			$document .= "\n";
		}
	
		return $document;
		
	}
}
?>

Para chamar ele estou usando o código abaixo:

<?

include('class.pdf2text.php');

$arquivo = "./upload/Pagina 1-1.pdf";

$a = new PDF2Text();
$a->setFilename($arquivo);
$a->decodePDF();
echo $a->output();

?>

Até aí, maravilha, tudo que não tenha caractere especial ou acento, vai que vai...

Meu problema começa quando ele encontra uma palavra acentuada... Do tipo Coração... O meu quase para, porque ele escreve Cora��o...

Tipo assim:

Tem como alguém me dar uma ajuda em relação a isso?

Preciso apenas que ele leia e retorne os valores acentuados normalmente...

Tentei usar uma função para trocar tudo que estivesse acentuado, mas não funcionou...

DiF · 25 de maio de 2017

@Jakson Fischer Já tentou usar o utf8_decode() ou o utf8_encode() no php?

Isso é problema de codificação. No seu sistema, sete o utf-8 como charset, o pdf também precisa estar em utf-8... o banco de dados a mesma coisa.

Jakson Fischer · 25 de maio de 2017

4 horas atrás, DiF disse:

@Jakson Fischer Já tentou usar o utf8_decode() ou o utf8_encode() no php?

Isso é problema de codificação. No seu sistema, sete o utf-8 como charset, o pdf também precisa estar em utf-8... o banco de dados a mesma coisa.

@DiF

Não consegui, o PDF está em UTF-8, sistema, banco, tudo, mas ele não consegue retornar os caracteres especias e nem os acentos :/

Carlos Zanon · 25 de maio de 2017

Tenta adicionar (*UTF8) no começo das expressões regulares... tipo:

preg_match("(*UTF8)#stream[\n|\r](.*)endstream[\n|\r]#ismU"

Me recordo de ter resolvi problemas assim antigamente :v

Em outras situações usei o essa aqui http://php.net/manual/en/function.mb-internal-encoding.php junto com os preg_match

tenta usar o mb_internal_encoding primeiro, se não me engano é o que melhor funciona ;P

@edit

mb_internal_encoding('UTF8');
preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU"

Jakson Fischer · 26 de maio de 2017

Então, acho que o meu maior problema é que eu estou iniciando nesse mundo do PHP, não sei onde colocar isso tudo que me disseram, podem me ajudar em relação a colocar isso tudo dentro deste código?

Carlos Zanon · 27 de maio de 2017

Em 25/05/2017 às 16:25, Jakson Fischer disse:

<? class PDF2Text { // Some settings

coloca logo após o <? aquele mb_internal, deve funcionar

Jakson Fischer · 29 de maio de 2017

Em 2017-5-26 às 19:51, Carlos Zanon disse:

coloca logo após o <? aquele mb_internal, deve funcionar

Coloquei como disse o seguinte código:

mb_internal_encoding("UTF-8");
//mb_internal_encoding('UTF-8');
mb_http_output('UTF-8');
mb_http_input('UTF-8');
mb_language('uni');
mb_regex_encoding('UTF-8');
ob_start('mb_output_handler');