[ Index ] |
MailPress 7.2 |
[ Index ] [ Classes ] [ Functions ] [ Variables ] [ Constants ] [ Statistics ] |
[Summary view] [Print] [Text view]
<?php

/*
 * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
 *
 * This script is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * The GNU General Public License can be found at
 * http://www.gnu.org/copyleft/gpl.html.
 *
 * This script is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

/**
 * Converts an HTML document into formatted plain text.
 *
 * Typical use: construct with the HTML (and optional options), then call
 * getText(). Conversion is lazy and cached until setHtml() is called again.
 *
 * NOTE(review): the parent class MP_html2txt_ is declared elsewhere; nothing
 * in this class calls into it explicitly.
 */
class MP_Html2txt extends MP_html2txt_
{
    const ENCODING = 'UTF-8';

    /**
     * Flags passed to html_entity_decode() / htmlspecialchars().
     *
     * @type int
     */
    protected $htmlFuncFlags;

    /**
     * Contains the HTML content to convert.
     *
     * @type string
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @type string
     */
    protected $text;

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * @type array
     * @see $replace
     */
    protected $search = array(
        "/\r/",                                           // Non-legal carriage return
        "/[\n\t]+/",                                      // Newlines and tabs
        '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
        '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
        '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
        '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
        '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
        '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
        '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
        '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
        '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
        '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
        '/<li\b[^>]*>/i',                                 // <li>
        '/<hr\b[^>]*>/i',                                 // <hr>
        '/<div\b[^>]*>/i',                                // <div>
        '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
        '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
        '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
        '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @type array
     * @see $search
     */
    protected $replace = array(
        '',                              // Non-legal carriage return
        ' ',                             // Newlines and tabs
        '',                              // <head>
        '',                              // <script>s -- which strip_tags supposedly has problems with
        '',                              // <style>s -- which strip_tags supposedly has problems with
        '_\\1_',                         // <i>
        '_\\1_',                         // <em>
        "\n\n",                          // <ul> and </ul>
        "\n\n",                          // <ol> and </ol>
        "\n\n",                          // <dl> and </dl>
        "\t* \\1\n",                     // <li> and </li>
        " \\1\n",                        // <dd> and </dd>
        "\t* \\1",                       // <dt> and </dt>
        "\n\t* ",                        // <li>
        "\n-------------------------\n", // <hr>
        "<div>\n",                       // <div>
        "\n\n",                          // <table> and </table>
        "\n",                            // <tr> and </tr>
        "\t\t\\1\n",                     // <td> and </td>
        "",                              // <span class="_html2text_ignore">...</span>
        '[\\2]',                         // <img> with alt tag
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $entReplace.
     *
     * The first two entries are numeric references in the win-1252 range
     * (153 = TM, 151 = m-dash); they are mapped explicitly to the intended
     * characters before the generic entity decoding runs.
     *
     * @type array
     * @see $entReplace
     */
    protected $entSearch = array(
        '/&#153;/i',      // TM symbol in win-1252
        '/&#151;/i',      // m-dash in win-1252
        '/&(amp|#38);/i', // Ampersand: see converter()
        '/[ ]{2,}/',      // Runs of spaces, post-handling
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @type array
     * @see $entSearch
     */
    protected $entReplace = array(
        '™',         // TM symbol
        '—',         // m-dash
        '|+|amp|+|', // Ampersand: see converter()
        ' ',         // Runs of spaces, post-handling
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name that
     * pregCallback() switches on.
     *
     * @type array
     */
    protected $callbackSearch = array(
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
        '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
        '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $preReplace.
     *
     * @type array
     * @see $preReplace
     */
    protected $preSearch = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * Whitespace is turned into <br> / &nbsp; so that it is not collapsed by
     * the "Newlines and tabs" and "Runs of spaces" rules applied later.
     *
     * @type array
     * @see $preSearch
     */
    protected $preReplace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        '',
    );

    /**
     * Temporary workspace used during PRE processing.
     *
     * @type string
     */
    protected $preContent = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @type string
     */
    protected $baseurl = '';

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @type boolean
     * @see $html, $text
     */
    protected $converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @type array
     * @see buildlinkList()
     */
    protected $linkList = array();

    /**
     * Various configuration options (able to be set in the constructor)
     *
     * @type array
     */
    protected $options = array(
        'do_links' => 'table', // 'none'
                               // 'inline' (show links inline)
                               // 'nextline' (show links on the next line)
                               // 'table' (if a table of link URLs should be listed after the text.
                               // 'bbcode' (show links as bbcode)

        'width' => 0,          // Maximum width of the formatted text, in columns.
                               // Set this value to 0 (or less) to ignore word wrapping
                               // and not constrain text to a fixed-width column.
    );

    /**
     * Handles the old constructor signature ($html, $fromFile, $options).
     *
     * @param string $html     Source HTML
     * @param bool   $fromFile No longer supported; a truthy value makes set_html() throw
     * @param array  $options  Set configuration options
     */
    private function legacyConstruct($html = '', $fromFile = false, array $options = array())
    {
        $this->set_html($html, $fromFile);
        $this->options = array_merge($this->options, $options);
    }

    /**
     * @param string $html    Source HTML
     * @param array  $options Set configuration options
     */
    public function __construct($html = '', $options = array())
    {
        // Set the entity flags before anything else: the legacy branch below
        // returns early, and every later html_entity_decode()/htmlspecialchars()
        // call relies on this property holding an integer.
        $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
            ? ENT_COMPAT
            : ENT_COMPAT | ENT_HTML5;

        // for backwards compatibility
        if (!is_array($options)) {
            call_user_func_array(array($this, 'legacyConstruct'), func_get_args());

            return;
        }

        $this->html = $html;
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Get the source HTML
     *
     * @return string
     */
    public function getHtml()
    {
        return $this->html;
    }

    /**
     * Set the source HTML
     *
     * Also marks the cached text as stale so the next getText() reconverts.
     *
     * @param string $html HTML source content
     */
    public function setHtml($html)
    {
        $this->html = $html;
        $this->converted = false;
    }

    /**
     * @deprecated Use setHtml() instead.
     *
     * @param string $html      HTML source content
     * @param bool   $from_file Must be falsy
     * @throws \InvalidArgumentException when $from_file is truthy
     */
    public function set_html($html, $from_file = false)
    {
        if ($from_file) {
            throw new \InvalidArgumentException("Argument from_file no longer supported");
        }

        return $this->setHtml($html);
    }

    /**
     * Returns the text, converted from HTML.
     *
     * @return string
     */
    public function getText()
    {
        if (!$this->converted) {
            $this->convert();
        }

        return $this->text;
    }

    /**
     * @deprecated Use getText() instead.
     */
    public function get_text()
    {
        return $this->getText();
    }

    /**
     * @deprecated Prints the converted text.
     */
    public function print_text()
    {
        print $this->getText();
    }

    /**
     * @deprecated Alias of print_text().
     */
    public function p()
    {
        return $this->print_text();
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * @param string $baseurl
     */
    public function setBaseUrl($baseurl)
    {
        $this->baseurl = $baseurl;
    }

    /**
     * @deprecated Use setBaseUrl() instead.
     */
    public function set_base_url($baseurl)
    {
        return $this->setBaseUrl($baseurl);
    }

    /**
     * Runs the conversion with the mbstring internal encoding temporarily
     * switched to self::ENCODING, restoring the previous encoding afterwards.
     */
    protected function convert()
    {
        $origEncoding = mb_internal_encoding();
        mb_internal_encoding(self::ENCODING);

        $this->doConvert();

        mb_internal_encoding($origEncoding);
    }

    /**
     * Converts $this->html into $this->text and, when links were collected,
     * appends the numbered link table.
     */
    protected function doConvert()
    {
        $this->linkList = array();

        $text = trim($this->html);

        $this->converter($text);

        if ($this->linkList) {
            $text .= "\n\n" . '* ~~~~~ *' . "\n";
            foreach ($this->linkList as $i => $url) {
                $text .= '[' . ($i + 1) . '] ' . $url . "\n";
            }
        }

        $this->text = $text;

        $this->converted = true;
    }

    /**
     * Converts an HTML fragment to text in place.
     *
     * Also called recursively by convertBlockquotes() for blockquote bodies.
     *
     * @param string $text HTML in, plain text out (modified by reference)
     */
    protected function converter(&$text)
    {
        $this->convertBlockquotes($text);
        $this->convertPre($text);
        $text = preg_replace($this->search, $this->replace, $text);
        $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
        $text = strip_tags($text);
        $text = preg_replace($this->entSearch, $this->entReplace, $text);
        $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Normalise empty lines
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        if ($this->options['width'] > 0) {
            $text = wordwrap($text, $this->options['width']);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param  string      $link         URL of the link
     * @param  string      $display      Part of the text to associate number with
     * @param  string|null $linkOverride Link method to use instead of the 'do_links' option
     * @return string
     */
    protected function buildlinkList($link, $display, $linkOverride = null)
    {
        $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
        if ($linkMethod == 'none') {
            return $display;
        }

        // Ignored link types
        if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // Link carries its own scheme: use it untouched
            $url = $link;
        } else {
            // Relative link: prefix the base URL, inserting a slash if needed
            $url = $this->baseurl;
            if (mb_substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= $link;
        }

        if ($linkMethod == 'table') {
            // Strict search so that numeric-looking strings are never treated as equal
            if (($index = array_search($url, $this->linkList, true)) === false) {
                $index = count($this->linkList);
                $this->linkList[] = $url;
            }

            return $display . ' [' . ($index + 1) . ']';
        } elseif ($linkMethod == 'nextline') {
            return $display . "\n[" . $url . ']';
        } elseif ($linkMethod == 'bbcode') {
            return sprintf('[url=%s]%s[/url]', $url, $display);
        } else { // link_method defaults to inline
            return $display . ' [' . $url . ']';
        }
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Rewrites each PRE element, one at a time, into a DIV whose whitespace
     * has been made explicit via $preSearch / $preReplace.
     *
     * @param string $text HTML content (modified by reference)
     */
    protected function convertPre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
            $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);

            // Run our defined tags search-and-replace with callback
            $this->preContent = preg_replace_callback(
                $this->callbackSearch,
                array($this, 'pregCallback'),
                $this->preContent
            );

            // convert the content
            $this->preContent = sprintf(
                '<div><br>%s<br></div>',
                preg_replace($this->preSearch, $this->preReplace, $this->preContent)
            );

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback(
                '/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pregPreCallback'),
                $text,
                1
            );

            // free memory
            $this->preContent = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Each outermost blockquote is converted on its own, prefixed with "> "
     * citation markers and wrapped in a PRE block.
     *
     * @param string $text HTML content (modified by reference)
     */
    protected function convertBlockquotes(&$text)
    {
        if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            $originalText = $text;
            $start = 0;
            $taglen = 0;
            $level = 0;
            // Characters by which $text has shrunk so far relative to $originalText
            $diff = 0;
            foreach ($matches[0] as $m) {
                // PREG_OFFSET_CAPTURE yields byte offsets; convert to a character offset
                $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
                if ($m[0][0] == '<' && $m[0][1] == '/') {
                    $level--;
                    if ($level < 0) {
                        $level = 0; // malformed HTML: go to next blockquote
                    } elseif ($level > 0) {
                        // skip inner blockquote
                    } else {
                        $end = $m[1];
                        $len = $end - $taglen - $start;
                        // Get blockquote content
                        $body = mb_substr($text, $start + $taglen - $diff, $len);

                        // Set text width (leave room for the "> " marker)
                        $pWidth = $this->options['width'];
                        if ($this->options['width'] > 0) {
                            $this->options['width'] -= 2;
                        }
                        // Convert blockquote content
                        $body = trim($body);
                        $this->converter($body);
                        // Add citation markers and create PRE block
                        $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
                        $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
                        // Re-set text width
                        $this->options['width'] = $pWidth;
                        // Replace content
                        $text = mb_substr($text, 0, $start - $diff)
                            . $body
                            . mb_substr($text, $end + mb_strlen($m[0]) - $diff);

                        $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
                        unset($body);
                    }
                } else {
                    if ($level == 0) {
                        $start = $m[1];
                        $taglen = mb_strlen($m[0]);
                    }
                    $level++;
                }
            }
        }
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured as group 1 by $callbackSearch.
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregCallback($matches)
    {
        switch (mb_strtolower($matches[1])) {
            case 'p':
                // Replace newlines with spaces.
                $para = str_replace("\n", " ", $matches[3]);

                // Trim trailing and leading whitespace within the tag.
                $para = trim($para);

                // Add trailing newlines for this para.
                return "\n" . $para . "\n";
            case 'br':
                return "\n";
            case 'b':
            case 'strong':
                return $this->toupper($matches[3]);
            case 'th':
                return $this->toupper("\t\t" . $matches[3] . "\n");
            case 'h':
                return $this->toupper("\n\n" . $matches[3] . "\n\n");
            case 'a':
                // override the link method
                $linkOverride = null;
                if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
                    $linkOverride = $linkOverrideMatch[1];
                }
                // Remove spaces in URL (#1487805)
                $url = str_replace(' ', '', $matches[3]);

                return $this->buildlinkList($url, $matches[5], $linkOverride);
        }

        return '';
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * Ignores the match and returns the content prepared by convertPre().
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
    {
        return $this->preContent;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function toupper($str)
    {
        // string can contain HTML tags
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $i => $chunk) {
            if ($chunk[0] != '<') {
                $chunks[$i] = $this->strtoupper($chunk);
            }
        }

        return implode($chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded before upper-casing and special characters are
     * re-encoded afterwards, so entity names are never upper-cased.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function strtoupper($str)
    {
        $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);

        return $str;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Tue May 19 15:55:14 2020 | Cross-referenced by PHPXref 0.7.1 |