1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
|
<?php
/**
 * Extracts the most frequent "long" words from a piece of HTML.
 *
 * Usage: construct with the raw HTML, call cleanData() once, then call
 * getData($n) to obtain the $n most frequent words with their counts.
 *
 * The text is processed as UTF-8: entities are decoded to UTF-8 and the accent
 * table below is keyed by UTF-8 characters, so this file must be saved as UTF-8.
 */
class ExtractTextHTML
{
    /**
     * Accented character => unaccented ASCII replacement.
     *
     * Kept as an array so that strtr() replaces whole (multi-byte) characters;
     * the two-string form of strtr() maps byte by byte and corrupts UTF-8.
     *
     * @var array
     */
    private static $accent_map = [
        'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a',
        'ç' => 'c',
        'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e',
        'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
        'ñ' => 'n',
        'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o',
        'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
        'ý' => 'y', 'ÿ' => 'y',
        'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A',
        'Ç' => 'C',
        'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E',
        'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
        'Ñ' => 'N',
        'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O',
        'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U',
        'Ý' => 'Y',
    ];

    /** @var mixed Raw text handed to the constructor (never modified). */
    private $s_text_init;

    /** @var string Working copy of the text, rewritten by each cleaning step. */
    private $curr = '';

    /** @var array Word => occurrence count, sorted by ascending count. */
    private $a_tab_final = [];

    /**
     * @param string $text Raw HTML (or plain text) to analyse.
     */
    public function __construct($text)
    {
        $this->s_text_init = $text;
    }

    /**
     * Runs the whole cleaning pipeline and builds the word-frequency table.
     *
     * Can be called again; it always restarts from the original text.
     *
     * @return void
     */
    public function cleanData()
    {
        // Cast so that null / non-string input does not reach the string functions.
        $this->curr = (string) $this->s_text_init;
        $this->HTMLtoISO();
        $this->strip_accents();
        $this->strip_tags();
        $this->strip_ponctuations();
        $this->str_to_lower();
        $this->limit_word(4);
        $this->string_to_array();
    }

    /**
     * Returns the $nb_entry most frequent words.
     *
     * @param int $nb_entry Maximum number of entries wanted.
     *
     * @return array Word => count, ordered by ascending count (the most
     *               frequent word is last). Empty if cleanData() has not been
     *               called or if $nb_entry is not positive.
     */
    public function getData($nb_entry)
    {
        $nb_entry = (int) $nb_entry;
        if ($nb_entry <= 0) {
            return [];
        }

        // preserve_keys = true: purely numeric words are stored under integer
        // keys and would otherwise be renumbered 0, 1, 2... by array_slice().
        return array_slice($this->a_tab_final, -$nb_entry, $nb_entry, true);
    }

    /**
     * Replaces every tag by a space and decodes HTML entities to UTF-8.
     *
     * @return void
     */
    protected function HTMLtoISO()
    {
        // "s" lets "." match newlines so that tags written on several lines
        // are also replaced by a space (keeps neighbouring words apart).
        $this->curr = preg_replace('/\<.+?\>/s', ' ', $this->curr);
        // Explicit charset so the result does not depend on the ini default
        // and always matches the UTF-8 accent table.
        $this->curr = html_entity_decode($this->curr, ENT_QUOTES, 'UTF-8');
    }

    /**
     * Replaces accented Latin letters by their unaccented equivalent.
     *
     * @return void
     */
    protected function strip_accents()
    {
        $this->curr = strtr($this->curr, self::$accent_map);
    }

    /**
     * Removes any tag that appeared once the entities were decoded.
     *
     * @return void
     */
    protected function strip_tags()
    {
        $this->curr = strip_tags($this->curr);
    }

    /**
     * Replaces every non-word character by a space and collapses whitespace.
     *
     * @return void
     */
    protected function strip_ponctuations()
    {
        $this->curr = preg_replace('/[\W]/', ' ', $this->curr);
        $this->curr = preg_replace('/\s+/', ' ', $this->curr);
    }

    /**
     * Lower-cases the working text.
     *
     * @return void
     */
    protected function str_to_lower()
    {
        $this->curr = strtolower($this->curr);
    }

    /**
     * Removes every word of at most $nb_car characters.
     *
     * Word boundaries are used instead of surrounding whitespace so that the
     * first and the last word of the text are filtered as well.
     *
     * @param int $nb_car Longest word length to discard.
     *
     * @return void
     */
    protected function limit_word($nb_car)
    {
        $nb_car = (int) $nb_car;
        if ($nb_car > 0) {
            $this->curr = preg_replace('/\b\w{1,' . $nb_car . '}\b/', ' ', $this->curr);
        }
        // Removed words leave runs of spaces behind; normalise them.
        $this->curr = trim(preg_replace('/\s+/', ' ', $this->curr));
    }

    /**
     * Splits the working text into words and counts each of them.
     *
     * @return void
     */
    protected function string_to_array()
    {
        // PREG_SPLIT_NO_EMPTY: never count the empty string as a word
        // (leading/trailing spaces, or an empty text).
        $words = preg_split('/\s+/', $this->curr, -1, PREG_SPLIT_NO_EMPTY);
        $this->a_tab_final = array_count_values($words);
        asort($this->a_tab_final, SORT_NUMERIC);
    }
}
?> |
Partager