[ Index ]

MailPress 7.2

[ Index ]     [ Classes ]     [ Functions ]     [ Variables ]     [ Constants ]     [ Statistics ]    

title

Body

[close]

/mp-includes/class/ -> MP_Html2txt.class.php (source)

   1  <?php
   2  
   3  /*
   4   * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
   5   *
   6   * This script is free software; you can redistribute it and/or modify
   7   * it under the terms of the GNU General Public License as published by
   8   * the Free Software Foundation; either version 2 of the License, or
   9   * (at your option) any later version.
  10   *
  11   * The GNU General Public License can be found at
  12   * http://www.gnu.org/copyleft/gpl.html.
  13   *
  14   * This script is distributed in the hope that it will be useful,
  15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17   * GNU General Public License for more details.
  18   */
  19  
  20  class MP_Html2txt extends MP_html2txt_
  21  {
  22      const ENCODING = 'UTF-8';
  23  
  24      protected $htmlFuncFlags;
  25  
  26      /**
  27       * Contains the HTML content to convert.
  28       *
  29       * @type string
  30       */
  31      protected $html;
  32  
  33      /**
  34       * Contains the converted, formatted text.
  35       *
  36       * @type string
  37       */
  38      protected $text;
  39  
  40      /**
  41       * List of preg* regular expression patterns to search for,
  42       * used in conjunction with $replace.
  43       *
  44       * @type array
  45       * @see $replace
  46       */
  47      protected $search = array(
  48          "/\r/",                                           // Non-legal carriage return
  49          "/[\n\t]+/",                                      // Newlines and tabs
  50          '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
  51          '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
  52          '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
  53          '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
  54          '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
  55          '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
  56          '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
  57          '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
  58          '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
  59          '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
  60          '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
  61          '/<li\b[^>]*>/i',                                 // <li>
  62          '/<hr\b[^>]*>/i',                                 // <hr>
  63          '/<div\b[^>]*>/i',                                // <div>
  64          '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
  65          '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
  66          '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
  67          '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
  68          '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
  69      );
  70  
  71      /**
  72       * List of pattern replacements corresponding to patterns searched.
  73       *
  74       * @type array
  75       * @see $search
  76       */
  77      protected $replace = array(
  78          '',                              // Non-legal carriage return
  79          ' ',                             // Newlines and tabs
  80          '',                              // <head>
  81          '',                              // <script>s -- which strip_tags supposedly has problems with
  82          '',                              // <style>s -- which strip_tags supposedly has problems with
  83          '_\\1_',                         // <i>
  84          '_\\1_',                         // <em>
  85          "\n\n",                          // <ul> and </ul>
  86          "\n\n",                          // <ol> and </ol>
  87          "\n\n",                          // <dl> and </dl>
  88          "\t* \\1\n",                     // <li> and </li>
  89          " \\1\n",                        // <dd> and </dd>
  90          "\t* \\1",                       // <dt> and </dt>
  91          "\n\t* ",                        // <li>
  92          "\n-------------------------\n", // <hr>
  93          "<div>\n",                       // <div>
  94          "\n\n",                          // <table> and </table>
  95          "\n",                            // <tr> and </tr>
  96          "\t\t\\1\n",                     // <td> and </td>
  97          "",                              // <span class="_html2text_ignore">...</span>
  98          '[\\2]',                         // <img> with alt tag
  99      );
 100  
 101      /**
 102       * List of preg* regular expression patterns to search for,
 103       * used in conjunction with $entReplace.
 104       *
 105       * @type array
 106       * @see $entReplace
 107       */
 108      protected $entSearch = array(
 109          '/&#153;/i',                                     // TM symbol in win-1252
 110          '/&#151;/i',                                     // m-dash in win-1252
 111          '/&(amp|#38);/i',                                // Ampersand: see converter()
 112          '/[ ]{2,}/',                                     // Runs of spaces, post-handling
 113      );
 114  
 115      /**
 116       * List of pattern replacements corresponding to patterns searched.
 117       *
 118       * @type array
 119       * @see $entSearch
 120       */
 121      protected $entReplace = array(
 122          '™',         // TM symbol
 123          '—',         // m-dash
 124          '|+|amp|+|', // Ampersand: see converter()
 125          ' ',         // Runs of spaces, post-handling
 126      );
 127  
 128      /**
 129       * List of preg* regular expression patterns to search for
 130       * and replace using callback function.
 131       *
 132       * @type array
 133       */
 134      protected $callbackSearch = array(
 135          '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
 136          '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
 137          '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
 138          '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
 139          '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
 140          '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
 141          '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
 142      );
 143  
 144      /**
 145       * List of preg* regular expression patterns to search for in PRE body,
 146       * used in conjunction with $preReplace.
 147       *
 148       * @type array
 149       * @see $preReplace
 150       */
 151      protected $preSearch = array(
 152          "/\n/",
 153          "/\t/",
 154          '/ /',
 155          '/<pre[^>]*>/',
 156          '/<\/pre>/'
 157      );
 158  
 159      /**
 160       * List of pattern replacements corresponding to patterns searched for PRE body.
 161       *
 162       * @type array
 163       * @see $preSearch
 164       */
 165      protected $preReplace = array(
 166          '<br>',
 167          '&nbsp;&nbsp;&nbsp;&nbsp;',
 168          '&nbsp;',
 169          '',
 170          '',
 171      );
 172  
 173      /**
 174       * Temporary workspace used during PRE processing.
 175       *
 176       * @type string
 177       */
 178      protected $preContent = '';
 179  
 180      /**
 181       * Contains the base URL that relative links should resolve to.
 182       *
 183       * @type string
 184       */
 185      protected $baseurl = '';
 186  
 187      /**
 188       * Indicates whether content in the $html variable has been converted yet.
 189       *
 190       * @type boolean
 191       * @see $html, $text
 192       */
 193      protected $converted = false;
 194  
 195      /**
 196       * Contains URL addresses from links to be rendered in plain text.
 197       *
 198       * @type array
 199       * @see buildlinkList()
 200       */
 201      protected $linkList = array();
 202  
 203      /**
 204       * Various configuration options (able to be set in the constructor)
 205       *
 206       * @type array
 207       */
 208      protected $options = array(
 209          'do_links' => 'table', // 'none'
 210                                  // 'inline' (show links inline)
 211                                  // 'nextline' (show links on the next line)
 212                                  // 'table' (if a table of link URLs should be listed after the text.
 213                                  // 'bbcode' (show links as bbcode)
 214  
 215          'width' => 0,          //  Maximum width of the formatted text, in columns.
 216                                  //  Set this value to 0 (or less) to ignore word wrapping
 217                                  //  and not constrain text to a fixed-width column.
 218      );
 219  
 220      private function legacyConstruct($html = '', $fromFile = false, array $options = array())
 221      {
 222          $this->set_html($html, $fromFile);
 223          $this->options = array_merge($this->options, $options);
 224      }
 225  
 226      /**
 227       * @param string $html    Source HTML
 228       * @param array  $options Set configuration options
 229       */
 230      public function __construct($html = '', $options = array())
 231      {
 232          // for backwards compatibility
 233          if (!is_array($options)) {
 234              return call_user_func_array(array($this, 'legacyConstruct'), func_get_args());
 235          }
 236  
 237          $this->html = $html;
 238          $this->options = array_merge($this->options, $options);
 239          $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
 240              ? ENT_COMPAT
 241              : ENT_COMPAT | ENT_HTML5;
 242      }
 243  
 244      /**
 245      * Get the source HTML
 246      *
 247      * @return string
 248      */
 249      public function getHtml()
 250      {
 251          return $this->html;
 252      }
 253  
 254      /**
 255       * Set the source HTML
 256       *
 257       * @param string $html HTML source content
 258       */
 259      public function setHtml($html)
 260      {
 261          $this->html = $html;
 262          $this->converted = false;
 263      }
 264  
 265      /**
 266       * @deprecated
 267       */
 268      public function set_html($html, $from_file = false)
 269      {
 270          if ($from_file) {
 271              throw new \InvalidArgumentException("Argument from_file no longer supported");
 272          }
 273  
 274          return $this->setHtml($html);
 275      }
 276  
 277      /**
 278       * Returns the text, converted from HTML.
 279       *
 280       * @return string
 281       */
 282      public function getText()
 283      {
 284          if (!$this->converted) {
 285              $this->convert();
 286          }
 287  
 288          return $this->text;
 289      }
 290  
 291      /**
 292       * @deprecated
 293       */
 294      public function get_text()
 295      {
 296          return $this->getText();
 297      }
 298  
 299      /**
 300       * @deprecated
 301       */
 302      public function print_text()
 303      {
 304          print $this->getText();
 305      }
 306  
 307      /**
 308       * @deprecated
 309       */
 310      public function p()
 311      {
 312          return $this->print_text();
 313      }
 314  
 315      /**
 316       * Sets a base URL to handle relative links.
 317       *
 318       * @param string $baseurl
 319       */
 320      public function setBaseUrl($baseurl)
 321      {
 322          $this->baseurl = $baseurl;
 323      }
 324  
 325      /**
 326       * @deprecated
 327       */
 328      public function set_base_url($baseurl)
 329      {
 330          return $this->setBaseUrl($baseurl);
 331      }
 332  
 333      protected function convert()
 334      {
 335         $origEncoding = mb_internal_encoding();
 336         mb_internal_encoding(self::ENCODING);
 337  
 338         $this->doConvert();
 339  
 340         mb_internal_encoding($origEncoding);
 341      }
 342  
 343      protected function doConvert()
 344      {
 345          $this->linkList = array();
 346  
 347          $text = trim($this->html);
 348  
 349          $this->converter($text);
 350  
 351          if ($this->linkList) {
 352              $text .= "\n\n" . '* ~~~~~ *' . "\n";
 353              foreach ($this->linkList as $i => $url) {
 354                  $text .= '[' . ($i + 1) . '] ' . $url . "\n";
 355              }
 356          }
 357  
 358          $this->text = $text;
 359  
 360          $this->converted = true;
 361      }
 362  
 363      protected function converter(&$text)
 364      {
 365          $this->convertBlockquotes($text);
 366          $this->convertPre($text);
 367          $text = preg_replace($this->search, $this->replace, $text);
 368          $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
 369          $text = strip_tags($text);
 370          $text = preg_replace($this->entSearch, $this->entReplace, $text);
 371          $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);
 372  
 373          // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
 374          $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);
 375  
 376          // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
 377          // This properly handles situation of "&amp;quot;" in input string
 378          $text = str_replace('|+|amp|+|', '&', $text);
 379  
 380          // Normalise empty lines
 381          $text = preg_replace("/\n\s+\n/", "\n\n", $text);
 382          $text = preg_replace("/[\n]{3,}/", "\n\n", $text);
 383  
 384          // remove leading empty lines (can be produced by eg. P tag on the beginning)
 385          $text = ltrim($text, "\n");
 386  
 387          if ($this->options['width'] > 0) {
 388              $text = wordwrap($text, $this->options['width']);
 389          }
 390      }
 391  
 392      /**
 393       * Helper function called by preg_replace() on link replacement.
 394       *
 395       * Maintains an internal list of links to be displayed at the end of the
 396       * text, with numeric indices to the original point in the text they
 397       * appeared. Also makes an effort at identifying and handling absolute
 398       * and relative links.
 399       *
 400       * @param  string $link          URL of the link
 401       * @param  string $display       Part of the text to associate number with
 402       * @param  null   $linkOverride
 403       * @return string
 404       */
 405      protected function buildlinkList($link, $display, $linkOverride = null)
 406      {
 407          $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
 408          if ($linkMethod == 'none') {
 409              return $display;
 410          }
 411  
 412          // Ignored link types
 413          if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
 414              return $display;
 415          }
 416  
 417          if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
 418              $url = $link;
 419          } else {
 420              $url = $this->baseurl;
 421              if (mb_substr($link, 0, 1) != '/') {
 422                  $url .= '/';
 423              }
 424              $url .= $link;
 425          }
 426  
 427          if ($linkMethod == 'table') {
 428              if (($index = array_search($url, $this->linkList)) === false) {
 429                  $index = count($this->linkList);
 430                  $this->linkList[] = $url;
 431              }
 432  
 433              return $display . ' [' . ($index + 1) . ']';
 434          } elseif ($linkMethod == 'nextline') {
 435              return $display . "\n[" . $url . ']';
 436          } elseif ($linkMethod == 'bbcode') {
 437              return sprintf('[url=%s]%s[/url]', $url, $display);
 438          } else { // link_method defaults to inline
 439              return $display . ' [' . $url . ']';
 440          }
 441      }
 442  
 443      protected function convertPre(&$text)
 444      {
 445          // get the content of PRE element
 446          while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
 447              // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
 448              $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);
 449  
 450              // Run our defined tags search-and-replace with callback
 451              $this->preContent = preg_replace_callback(
 452                  $this->callbackSearch,
 453                  array($this, 'pregCallback'),
 454                  $this->preContent
 455              );
 456  
 457              // convert the content
 458              $this->preContent = sprintf(
 459                  '<div><br>%s<br></div>',
 460                  preg_replace($this->preSearch, $this->preReplace, $this->preContent)
 461              );
 462  
 463              // replace the content (use callback because content can contain $0 variable)
 464              $text = preg_replace_callback(
 465                  '/<pre[^>]*>.*<\/pre>/ismU',
 466                  array($this, 'pregPreCallback'),
 467                  $text,
 468                  1
 469              );
 470  
 471              // free memory
 472              $this->preContent = '';
 473          }
 474      }
 475  
 476      /**
 477       * Helper function for BLOCKQUOTE body conversion.
 478       *
 479       * @param string $text HTML content
 480       */
 481      protected function convertBlockquotes(&$text)
 482      {
 483          if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
 484              $originalText = $text;
 485              $start = 0;
 486              $taglen = 0;
 487              $level = 0;
 488              $diff = 0;
 489              foreach ($matches[0] as $m) {
 490                  $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
 491                  if ($m[0][0] == '<' && $m[0][1] == '/') {
 492                      $level--;
 493                      if ($level < 0) {
 494                          $level = 0; // malformed HTML: go to next blockquote
 495                      } elseif ($level > 0) {
 496                          // skip inner blockquote
 497                      } else {
 498                          $end = $m[1];
 499                          $len = $end - $taglen - $start;
 500                          // Get blockquote content
 501                          $body = mb_substr($text, $start + $taglen - $diff, $len);
 502  
 503                          // Set text width
 504                          $pWidth = $this->options['width'];
 505                          if ($this->options['width'] > 0) $this->options['width'] -= 2;
 506                          // Convert blockquote content
 507                          $body = trim($body);
 508                          $this->converter($body);
 509                          // Add citation markers and create PRE block
 510                          $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
 511                          $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
 512                          // Re-set text width
 513                          $this->options['width'] = $pWidth;
 514                          // Replace content
 515                          $text = mb_substr($text, 0, $start - $diff)
 516                              . $body
 517                              . mb_substr($text, $end + mb_strlen($m[0]) - $diff);
 518  
 519                          $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
 520                          unset($body);
 521                      }
 522                  } else {
 523                      if ($level == 0) {
 524                          $start = $m[1];
 525                          $taglen = mb_strlen($m[0]);
 526                      }
 527                      $level++;
 528                  }
 529              }
 530          }
 531      }
 532  
 533      /**
 534       * Callback function for preg_replace_callback use.
 535       *
 536       * @param  array  $matches PREG matches
 537       * @return string
 538       */
 539      protected function pregCallback($matches)
 540      {
 541          switch (mb_strtolower($matches[1])) {
 542              case 'p':
 543                  // Replace newlines with spaces.
 544                  $para = str_replace("\n", " ", $matches[3]);
 545  
 546                  // Trim trailing and leading whitespace within the tag.
 547                  $para = trim($para);
 548  
 549                  // Add trailing newlines for this para.
 550                  return "\n" . $para . "\n";
 551              case 'br':
 552                  return "\n";
 553              case 'b':
 554              case 'strong':
 555                  return $this->toupper($matches[3]);
 556              case 'th':
 557                  return $this->toupper("\t\t" . $matches[3] . "\n");
 558              case 'h':
 559                  return $this->toupper("\n\n" . $matches[3] . "\n\n");
 560              case 'a':
 561                  // override the link method
 562                  $linkOverride = null;
 563                  if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
 564                      $linkOverride = $linkOverrideMatch[1];
 565                  }
 566                  // Remove spaces in URL (#1487805)
 567                  $url = str_replace(' ', '', $matches[3]);
 568  
 569                  return $this->buildlinkList($url, $matches[5], $linkOverride);
 570          }
 571  
 572          return '';
 573      }
 574  
 575      /**
 576       * Callback function for preg_replace_callback use in PRE content handler.
 577       *
 578       * @param  array  $matches PREG matches
 579       * @return string
 580       */
 581      protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
 582      {
 583          return $this->preContent;
 584      }
 585  
 586      /**
 587       * Strtoupper function with HTML tags and entities handling.
 588       *
 589       * @param  string $str Text to convert
 590       * @return string Converted text
 591       */
 592      protected function toupper($str)
 593      {
 594          // string can contain HTML tags
 595          $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
 596  
 597          // convert toupper only the text between HTML tags
 598          foreach ($chunks as $i => $chunk) {
 599              if ($chunk[0] != '<') {
 600                  $chunks[$i] = $this->strtoupper($chunk);
 601              }
 602          }
 603  
 604          return implode($chunks);
 605      }
 606  
 607      /**
 608       * Strtoupper multibyte wrapper function with HTML entities handling.
 609       *
 610       * @param  string $str Text to convert
 611       * @return string Converted text
 612       */
 613      protected function strtoupper($str)
 614      {
 615          $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
 616          $str = mb_strtoupper($str);
 617          $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);
 618  
 619          return $str;
 620      }
 621  }


Generated: Tue May 19 15:55:14 2020 Cross-referenced by PHPXref 0.7.1