[ Index ]

PHP Cross Reference of Akelos Framework

title

Body

[close]

/AkConverters/ -> AkPdfToText.php (source)

   1  <?php
   2  /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
   3  
   4  // +----------------------------------------------------------------------+
   5  // | Akelos Framework - http://www.akelos.org                             |
   6  // +----------------------------------------------------------------------+
   7  // | Copyright (c) 2002-2006, Akelos Media, S.L.  & Bermi Ferrer Martinez |
   8  // | Released under the GNU Lesser General Public License, see LICENSE.txt|
   9  // +----------------------------------------------------------------------+
  10  
  11  /**
  12   * Converts a PDF into text in order to index it for full text searching
  13   * 
  14   * @package ActiveSupport
  15   * @subpackage Converters
  16   * @author Bermi Ferrer <bermi a.t akelos c.om>
  17   * @copyright Copyright (c) 2002-2006, Akelos Media, S.L. http://www.akelos.org
  18   * @license GNU Lesser General Public License <http://www.gnu.org/copyleft/lesser.html>
  19   */
  20  class AkPdfToText
  21  {
  22  
  23      function extractTextFromPdf($postScriptData)
  24      {
  25          if (!is_string($postScriptData)) {
  26              return '';
  27          }
  28          $text = '';
  29          $postScriptData = str_replace('\)', '##ENDBRACKET##', $postScriptData);
  30          $postScriptData = str_replace('\]', '##ENDSBRACKET##', $postScriptData);
  31          preg_match_all(
  32          '/(T[wdcm*])[\s]*(\[([^\]]*)\]|\(([^\)]*)\))[\s]*Tj/si',
  33          $postScriptData,
  34          $matches
  35          );
  36          for ($i = 0; $i < sizeof($matches[0]); $i++) {
  37              if ($matches[3][$i] != '') {
  38                  preg_match_all('/\(([^)]*)\)/si', $matches[3][$i], $subMatches);
  39                  foreach ($subMatches[1] as $subMatch) {
  40                      $text .= $subMatch;
  41                  }
  42              } else if ($matches[4][$i] != '') {
  43                  $text .= ($matches[1][$i] == 'Tc' ? ' ' : '') . $matches[4][$i];
  44              }
  45          }
  46          $trans = array(
  47          '...'                => '&hellip;',
  48          '\205'                => '&hellip;',
  49          '\221'                => chr(145),
  50          '\222'                => chr(146),
  51          '\223'                => chr(147),
  52          '\224'                => chr(148),
  53          '\363'                => chr(243),
  54          '\226'                => '-',
  55          '\267'                => '&bull;',
  56          '\('                => '(',
  57          '\['                => '[',
  58          '##ENDBRACKET##'    => ')',
  59          '##ENDSBRACKET##'    => ']',
  60          chr(133)            => '-',
  61          chr(141)            => chr(147),
  62          chr(142)            => chr(148),
  63          chr(143)            => chr(145),
  64          chr(144)            => chr(146),
  65          '\032' => chr(136), '\036' => chr(176), '\037' => chr(152), '\041' => chr(33), '\042' => chr(34), '\043' => chr(35), '\044' => chr(36), '\045' => chr(37), '\046' => chr(38), '\047' => chr(39), '\050' => chr(40), '\051' => chr(41), '\052' => chr(42), '\053' => chr(43), '\054' => chr(44), '\055' => chr(45), '\056' => chr(46), '\057' => chr(47), '\061' => chr(49), '\062' => chr(50), '\063' => chr(51), '\064' => chr(52), '\065' => chr(53), '\066' => chr(54), '\067' => chr(55), '\070' => chr(56), '\071' => chr(57), '\072' => chr(58), '\073' => chr(59), '\074' => chr(60), '\075' => chr(61), '\076' => chr(62), '\100' => chr(64), '\101' => chr(65), '\102' => chr(66), '\103' => chr(67), '\104' => chr(68), '\105' => chr(69), '\106' => chr(70), '\107' => chr(71), '\110' => chr(72), '\111' => chr(73), '\112' => chr(74), '\113' => chr(75), '\114' => chr(76), '\115' => chr(77), '\116' => chr(78), '\117' => chr(79), '\120' => chr(80), '\121' => chr(81), '\122' => chr(82), '\123' => chr(83), '\124' => chr(84), '\125' => chr(85), '\126' => chr(86), '\127' => chr(87), '\130' => chr(88), '\131' => chr(89), '\132' => chr(90), '\133' => chr(91), '\134' => chr(92), '\135' => chr(93), '\136' => chr(94), '\137' => chr(95), '\140' => chr(96), '\141' => chr(97), '\142' => chr(98), '\143' => chr(99), '\144' => chr(100), '\145' => chr(101), '\146' => chr(102), '\147' => chr(103), '\150' => chr(104), '\151' => chr(105), '\152' => chr(106), '\153' => chr(107), '\154' => chr(108), '\155' => chr(109), '\156' => chr(110), '\157' => chr(111), '\160' => chr(112), '\161' => chr(113), '\162' => chr(114), '\163' => chr(115), '\164' => chr(116), '\165' => chr(117), '\166' => chr(118), '\167' => chr(119), '\170' => chr(120), '\171' => chr(121), '\173' => chr(123), '\174' => chr(124), '\175' => chr(125), '\176' => chr(126), '\200' => chr(149), '\201' => chr(134), '\202' => chr(135), '\203' => chr(133), '\204' => chr(151), '\205' => chr(150), '\206' => chr(131), '\207' => chr(47), '\210' => chr(139), '\211' => chr(155), '\212' => chr(45), '\213' => chr(137), '\214' => chr(132), '\215' => chr(147), '\216' => chr(148), '\217' => chr(145), '\220' => chr(146), '\221' => chr(130), '\222' => chr(153), '\223' => chr(102), '\224' => chr(102), '\225' => chr(76), '\226' => chr(79), '\227' => chr(138), '\230' => chr(159), '\231' => chr(142), '\232' => chr(105), '\233' => chr(108), '\234' => chr(111), '\235' => chr(154), '\240' => chr(128), '\241' => chr(161), '\242' => chr(162), '\243' => chr(163), '\244' => chr(164), '\246' => chr(166), '\247' => chr(167), '\250' => chr(168), '\251' => chr(169), '\252' => chr(170), '\253' => chr(171), '\254' => chr(172), '\256' => chr(174), '\257' => chr(175), '\260' => chr(176), '\261' => chr(177), '\262' => chr(178), '\263' => chr(179), '\264' => chr(180), '\265' => chr(181), '\266' => chr(182), '\267' => chr(183), '\270' => chr(184), '\271' => chr(185), '\272' => chr(186), '\273' => chr(187), '\274' => chr(188), '\275' => chr(189), '\276' => chr(190), '\277' => chr(191), '\300' => chr(192), '\301' => chr(193), '\302' => chr(194), '\303' => chr(195), '\304' => chr(196), '\305' => chr(197), '\306' => chr(198), '\307' => chr(199), '\310' => chr(200), '\311' => chr(201), '\312' => chr(202), '\313' => chr(203), '\314' => chr(204), '\315' => chr(205), '\316' => chr(206), '\317' => chr(207), '\320' => chr(208), '\321' => chr(209), '\322' => chr(210), '\323' => chr(211), '\324' => chr(212), '\325' => chr(213), '\326' => chr(214), '\327' => chr(215), '\330' => chr(216), '\331' => chr(217), '\332' => chr(218), '\333' => chr(219), '\334' => chr(220), '\335' => chr(221), '\336' => chr(222), '\337' => chr(223), '\340' => chr(224), '\341' => chr(225), '\342' => chr(226), '\343' => chr(227), '\344' => chr(228), '\345' => chr(229), '\346' => chr(230), '\347' => chr(231), '\350' => chr(232), '\351' => chr(233), '\352' => chr(234), '\353' => chr(235), '\354' => chr(236), '\355' => chr(237), '\356' => chr(238), '\357' => chr(239), '\360' => chr(240), '\361' => chr(241), '\362' => chr(242), '\363' => chr(243), '\364' => chr(244), '\365' => chr(245), '\366' => chr(246), '\367' => chr(247), '\370' => chr(248), '\371' => chr(249), '\372' => chr(250), '\373' => chr(251), '\374' => chr(252), '\375' => chr(253), '\376' => chr(254),
  66          );
  67  
  68          return strtr($text, $trans);
  69  
  70      }
  71  
  72      function convert()
  73      {
  74          $searchstart = 'stream';
  75          $searchend = 'endstream';
  76          $pdfText = '';
  77          $pos = 0;
  78          $pos2 = 0;
  79          $startpos = 0;
  80          while ($pos !== false && $pos2 !== false) {
  81              $pos = strpos($this->source, $searchstart, $startpos);
  82              $pos2 = strpos($this->source, $searchend, $startpos + 1);
  83              if ($pos !== false && $pos2 !== false){
  84                  if ($this->source[$pos] == 0x0d && $this->source[$pos + 1] == 0x0a) {
  85                      $pos += 2;
  86                  } else if ($this->source[$pos] == 0x0a) {
  87                      $pos++;
  88                  }
  89                  if ($this->source[$pos2 - 2] == 0x0d && $this->source[$pos2 - 1] == 0x0a) {
  90                      $pos2 -= 2;
  91                  } else if ($this->source[$pos2 - 1] == 0x0a) {
  92                      $pos2--;
  93                  }
  94                  $textsection = substr(
  95                  $this->source,
  96                  $pos + strlen($searchstart) + 2,
  97                  $pos2 - $pos - strlen($searchstart) - 1
  98                  );
  99                  $data = @gzuncompress($textsection);
 100                  $pdfText .= $this->extractTextFromPdf($data);
 101                  $startpos = $pos2 + strlen($searchend) - 1;
 102  
 103              }
 104          }
 105  
 106          return preg_replace('/(\s)+/', ' ', $pdfText);
 107  
 108  
 109      }
 110  
 111  }
 112  
 113  ?>


Generated: Mon Oct 27 12:43:49 2008 Cross-referenced by PHPXref 0.6