tree = new DOMDocument();
        $start = microtime(true);
        if (!@$this->tree->loadHTML($text)) return false;

        $root = $this->tree->documentElement;
        $start = microtime(true);
        $this->HeuristicRemove($root, ( ($ratio == null) || ($min_len == null) ));

        if ($ratio == null) {
            $this->total_unlinked_text = $this->sanitize_text($this->total_unlinked_text);

            $words = preg_split('/[\s\r\n\t\|?!.,]+/', $this->total_unlinked_text);
            $words = array_filter($words);
            $this->total_unlinked_words = count($words);
            unset($words);
            if ($this->total_unlinked_words>0) {
                $this->link_text_ratio = $this->total_links / $this->total_unlinked_words;// + 0.01;
                $this->link_text_ratio *= 1.3;
            }
        } else {
            $this->link_text_ratio = $ratio;
        };

        if ($min_len == null) {
            // NOTE(review): divides by $this->text_blocks, which is only incremented in
            // HeuristicRemove() when unlinked text is found. If its initial value is 0
            // (not visible here) a text-free document divides by zero -- confirm.
            $this->min_text_len = strlen($this->total_unlinked_text)/$this->text_blocks;
        } else {
            $this->min_text_len = $min_len;
        }

        $start = microtime(true);
        $this->ContainerRemove($root);

        return $this->tree->saveHTML();
    }

    /**
     * First pass over the DOM: drops unwanted nodes and optionally gathers statistics.
     *
     * A node whose name is listed in $this->removed_tags is reported to the caller
     * (return value true), and the caller detaches it from its parent. Any other node
     * is descended into recursively.
     *
     * When $do_stats is true the walk additionally:
     *  - increments $this->total_links for every 'a' node,
     *  - appends the text of every '#text' child whose parent is not an 'a' node to
     *    $this->total_unlinked_text,
     *  - increments $this->text_blocks once per node that has at least one such text child.
     *
     * @param DOMNode|null $node     Node to process; null is tolerated and ignored.
     * @param bool         $do_stats Whether to collect the link/text statistics.
     * @return bool True if $node itself must be removed by the caller, false otherwise.
     */
    function HeuristicRemove($node, $do_stats = false){
        // Same null tolerance as ContainerRemove(): documentElement may be null.
        if (is_null($node)) {
            return false;
        }

        if (in_array($node->nodeName, $this->removed_tags)){
            return true;
        }

        if ($do_stats && ($node->nodeName == 'a')) {
            $this->total_links++;
        }

        if (!$node->hasChildNodes()) {
            return false;
        }

        // Tracks whether this node has already been counted as a text block.
        $found_text = false;
        // Removal is deferred so the child list is not modified while iterating it.
        $nodes_to_remove = array();

        foreach ($node->childNodes as $child) {
            if ($this->HeuristicRemove($child, $do_stats)) {
                $nodes_to_remove[] = $child;
            } else if ( $do_stats && ($node->nodeName != 'a') && ($child->nodeName == '#text') ) {
                // nodeValue is this node's own text. wholeText would also include
                // logically adjacent text nodes and count the same text more than once.
                $this->total_unlinked_text .= $child->nodeValue;
                if (!$found_text) {
                    $this->text_blocks++;
                    $found_text = true;
                }
            }
        }

        foreach ($nodes_to_remove as $child) {
            $node->removeChild($child);
        }

        return false;
    }

    /**
     * Second pass over the DOM: flags container nodes for deletion based on their
     * link/word ratio and on the amount of text they hold.
     *
     * Children are processed first; every child whose result has 'delete' set is
     * detached from $node. The node's own direct text is then run through
     * sanitize_text(), split into words and added to the totals.
     *
     * A node listed in $this->container_tags is flagged for deletion when
     *  - its link/word ratio exceeds $this->link_text_ratio (the ratio stays at 1
     *    when the node has no words), or
     *  - it is not listed in $this->ignore_len_tags and either its text length is
     *    below $this->min_text_len or its word count is below $this->min_words.
     *
     * @param DOMNode|null $node Node to evaluate.
     * @return array|int 0 when $node is null; otherwise
     *                   array(link count, word count, text length, 'delete' => bool).
     */
    function ContainerRemove($node){
        if (is_null($node)) return 0;

        $link_cnt = 0;
        $word_cnt = 0;
        $text_len = 0;
        $delete = false;
        $my_text = '';
        $ratio = 1;

        if ($node->hasChildNodes()){
            // Removal is deferred so the child list is not modified while iterating it.
            $nodes_to_remove = array();

            foreach ($node->childNodes as $child) {
                $data = $this->ContainerRemove($child);

                if ($data['delete']) {
                    $nodes_to_remove[] = $child;
                } else {
                    // Only surviving children contribute to the text length.
                    $text_len += $data[2];
                }

                // Links are accumulated from every child, deleted or not.
                $link_cnt += $data[0];

                if ($child->nodeName == 'a') {
                    $link_cnt++;
                } else {
                    if ($child->nodeName == '#text') {
                        // Use the node's own text. wholeText also returns the text of
                        // logically adjacent text nodes, so text nodes left side by side
                        // by an earlier removeChild() would each add the combined text.
                        $my_text .= $child->nodeValue;
                    }
                    $word_cnt += $data[1];
                }
            }

            foreach ($nodes_to_remove as $child) {
                $node->removeChild($child);
            }

            $my_text = $this->sanitize_text($my_text);

            $words = preg_split('/[\s\r\n\t\|?!.,\[\]]+/', $my_text);
            $words = array_filter($words);

            $word_cnt += count($words);
            $text_len += strlen($my_text);
        }

        if (in_array($node->nodeName, $this->container_tags)){
            if ($word_cnt>0) {
                $ratio = $link_cnt/$word_cnt;
            }

            if ($ratio > $this->link_text_ratio){
                $delete = true;
            }

            if ( !in_array($node->nodeName, $this->ignore_len_tags) ) {
                if ( ($text_len < $this->min_text_len) || ($word_cnt<$this->min_words) ) {
                    $delete = true;
                }
            }
        }

        return array($link_cnt, $word_cnt, $text_len, 'delete' => $delete);
    }
}

/****************************
    Simple usage example
*****************************/

$html = file_get_contents('../rene.html');

$extractor = new ContentExtractor();
$content = $extractor->extract($html);
echo $content;

?>