// NOTE: the line-number gutter (1-119) of the web listing this file was copied from stood here; it is not part of the program.
#include "html_parser.h"
#include "html_stream.h"
#include <tr1/regex>
#include <algorithm>
#include <cctype>
#include <cstring>
#include <iostream>
#include <set>
#include <stdexcept>
#include <string>
using namespace std::tr1;
using namespace std;
// Reports a link target on standard output the first time it is seen.
//
// Everything from the first '#' onwards (the fragment) is dropped before the
// URL is compared and printed, so "page#a" and "page#b" count as the same link.
// A URL that is empty once the fragment is removed is ignored.
void add_href( std::string url )
{
    // Every URL printed so far; kept for the lifetime of the program.
    static std::set<std::string> reported;

    // find() yields npos when there is no '#', and substr( 0, npos ) is the whole string.
    const std::string without_fragment = url.substr( 0, url.find( '#' ) );
    if ( without_fragment.empty() )
    {
        return;
    }

    // insert() reports through .second whether the URL was new.
    if ( reported.insert( without_fragment ).second )
    {
        std::cout << "URL: " << without_fragment << "\n";
    }
}
// Reports an image source on standard output the first time it is seen.
//
// The URL is used exactly as given (a fragment, if any, is kept).
// An empty URL is ignored, matching what add_href() does for links, so that
// a tag such as <img src=""> does not produce a bare "IMAGE: " line.
void add_image_src( const std::string & url )
{
    if ( url.empty() )
    {
        return;
    }

    // Every image URL printed so far; kept for the lifetime of the program.
    static std::set<std::string> reported;

    // insert() reports through .second whether the URL was new.
    if ( reported.insert( url ).second )
    {
        std::cout << "IMAGE: " << url << "\n";
    }
}
// Lower-cases a single character.
// The detour through unsigned char is required: handing std::tolower a
// negative char value (any non-ASCII byte where char is signed) is undefined.
static char to_lower_char( char c )
{
    return static_cast<char>( std::tolower( static_cast<unsigned char>( c ) ) );
}

// Dispatches one attribute of one tag to the matching collector.
//
// tag_name / attr_name are compared case-insensitively (both are lower-cased
// on local copies); attr_value is forwarded untouched.
//   <a href=...>    and <link href=...>  -> add_href()
//   <img src=...>                        -> add_image_src()
// Every other tag/attribute combination is ignored.
void handle_attribute( std::string tag_name, std::string attr_name, const std::string & attr_value )
{
    // A named helper is used instead of a bare `tolower`: with
    // `using namespace std;` the bare name refers to an overload set and
    // cannot be deduced as the transform functor.
    std::transform( tag_name.begin(), tag_name.end(), tag_name.begin(), to_lower_char );
    std::transform( attr_name.begin(), attr_name.end(), attr_name.begin(), to_lower_char );

    if ( attr_name == "href" )
    {
        if ( tag_name == "a" || tag_name == "link" )
        {
            add_href( attr_value );
        }
    }
    else if ( attr_name == "src" && tag_name == "img" )
    {
        add_image_src( attr_value );
    }
}
// Scans the text of one HTML tag and passes each attribute found in it to
// handle_attribute( tag name, attribute name, attribute value ).
//
// begin / end : range holding the tag text; it must start with '<'.
// Throws std::invalid_argument when the range is empty or does not start
// with '<'.
// A tag that does not match the "tag with attributes" pattern (no '=' before
// the closing '>') is skipped silently.
void parse_attributes_of_tag( std::string::const_iterator begin, std::string::const_iterator end )
{
    // Test for an empty range first: dereferencing begin when begin == end is undefined.
    if ( begin == end || *begin != '<' )
    {
        throw std::invalid_argument( "Not the begining of an HTML tag" );
    }

    // Group 2 is the tag name, group 3 everything between the name and the
    // closing '>' (it must contain at least one '=').
    static const regex tag_with_attr_pattern( "<(\\s*)(\\w+)(.*=.*)>" );
    smatch tag_match;
    if ( !regex_search( begin, end, tag_match, tag_with_attr_pattern ) )
    {
        return;
    }

    const std::string tag_name = tag_match[2].str();
    const std::string attributes = tag_match[3].str();

    // One pattern covers the three value syntaxes, so the search always
    // returns the NEXT attribute in document order:
    //   group 1 : attribute name (word characters plus ':', '.', '-')
    //   group 2 : value written between single quotes
    //   group 3 : value written between double quotes
    //   group 4 : unquoted value (runs up to whitespace, a quote, '=', '<', '>' or '`')
    // Trying the syntaxes one after the other over the whole remainder would
    // let a later single-quoted attribute hide an earlier double-quoted one.
    static const regex attribute_pattern(
        "([\\w:.-]+)\\s*=\\s*(?:'([^']*)'|\"([^\"]*)\"|([^\\s\"'=<>`]+))" );

    std::string::const_iterator cursor = attributes.begin();
    const std::string::const_iterator stop = attributes.end();
    smatch attr_match;
    while ( regex_search( cursor, stop, attr_match, attribute_pattern ) )
    {
        // Exactly one of the three value groups took part in the match.
        std::string value;
        if ( attr_match[2].matched )
        {
            value = attr_match[2].str();
        }
        else if ( attr_match[3].matched )
        {
            value = attr_match[3].str();
        }
        else
        {
            value = attr_match[4].str();
        }

        handle_attribute( tag_name, attr_match[1].str(), value );

        // Resume right after the END of this match. Advancing by the match
        // length alone is wrong whenever the match does not start at cursor.
        cursor = attr_match[0].second;
    }
}
// Feeds every tag of the stream to parse_attributes_of_tag().
//
// NOTE(review): html_stream is declared elsewhere. Presumably
// advance_to_next_tag() returns false once no tag is left, and
// begin()/end() delimit the tag it stopped on - confirm against html_stream.h.
void html_parser::parse_stream( html_stream & stream )
{
    for ( ;; )
    {
        if ( !stream.advance_to_next_tag() )
        {
            return;
        }
        parse_attributes_of_tag( stream.begin(), stream.end() );
    }
}
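// NOTE: "Partager" (the "share" link of the web page this listing was copied from) stood here; it is not part of the program.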