extraire des urls d'une page web

**Zorgloub** · 06/11/2007, 18h35

Bonjour,

J'essaie d'extraire des liens d'une page web. J'avais 2 possibilités qui s'offraient à moi :

*considérer la page web comme un arbre et en extraire les liens
*lire ma page ligne par ligne et utiliser des regexs pour extraire des liens

Je viens du Perl et j'ai exceptionnellement besoin de performance. J'ai lu, à juste titre, que charger ma page comme un arbre occuperait plus de place en mémoire que de lire le fichier ligne à ligne. (Et cela serait sans doute plus long de charger la page puis d'analyser les nœuds.) J'ai donc choisi de lire ma page ligne par ligne, et d'utiliser des regex pour en extraire des liens.

J'aimerais faire l'équivalent d'un

@matches = m#(http://www.*\.com)#g

qui me stockera donc mes liens dans un tableau. Pour télécharger ma page, j'utilise libcurl avec en particulier cet exemple :

http://curl.haxx.se/lxr/source/docs/examples/getinmemory.c

Je modifierai le code pour le wrapper en C++.

Je ne comprends pas très bien ce qu'est la structure chunk.memory, même si j'arrive à l'afficher comme une string à l'aide de std::cout << chunk.memory;

Je me demandais donc comment faire pour :

1) lire cette structure (qui ressemble fortement à un string) ligne à ligne
2) stocker plusieurs urls dans un vector de string comme je l'aurais simplement fait en Perl

Merci pour votre attention.

**Aurelien.Regat-Barrel** · 08/11/2007, 18h07

Note qu'il y a sûrement moyen de trouver des softs qui font déjà ça (wget...).

J'avais fait "mumuse" avec ça il y a quelque temps. Voici en gros ce que j'avais pondu :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#ifndef HTML_STREAM_H
#define HTML_STREAM_H
 
#include <string>
 
// Cursor over a private copy of an HTML document that exposes one tag at a
// time as the iterator range [begin(), end()).
//
// NOTE(review): i_begin and i_end are iterators into this object's own Buffer
// and the class declares no copy operations, so a compiler-generated copy
// would keep iterators that refer to the source object's Buffer. Avoid
// copying instances.
class html_stream
{
public:
	// Copies the given HTML text into the stream; the current range starts
	// out empty, positioned at the beginning of the text.
	html_stream( const std::string & );
 
	// Start of the current range (the '<' of the current tag once
	// advance_to_next_tag() has succeeded).
	std::string::const_iterator begin() const;
 
	// One past the last character of the current range.
	std::string::const_iterator end() const;
 
	// return false if there is no more tag
	bool advance_to_next_tag();
 
	// Tests the current tag against the given tag name, ignoring case.
	// Throws std::runtime_error( "no current tag" ) when begin() does not
	// point at a '<'.
	bool is_current_tag_type( const std::string & ) const;
 
	// True when the current range matches the pattern "<(.*)/(\s*)>".
	bool is_current_tag_self_closed() const;
 
	// NOTE(review): judging by the name, this is meant to move end() so the
	// range also covers the next closing tag - confirm against the definition.
	void set_end_to_include_next_closing_tag();
 
private:
	// Both iterators point into Buffer. They are declared before Buffer, so
	// they cannot be initialized from it in a constructor initializer list.
	std::string::const_iterator i_begin;
	std::string::const_iterator	i_end;
	std::string		Buffer;
};
 
#endif

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#include "html_stream.h"
 
#include <tr1/regex>
using namespace std::tr1;
 
// Stores a private copy of the HTML text and starts with an empty current
// range positioned at the first character of that copy.
html_stream::html_stream( const std::string & Data ):
    Buffer( Data )
{
	// Assigned in the body rather than the initializer list: both iterators
	// are declared before Buffer, so Buffer is not yet constructed when their
	// initializers would run.
	i_end = i_begin = Buffer.begin();
}
 
// Returns the start of the current range (i_begin).
std::string::const_iterator html_stream::begin() const
{
	return i_begin;
}
 
// Returns one past the last character of the current range (i_end).
std::string::const_iterator html_stream::end() const
{
	return i_end;
}
 
// Moves the current range to the next "<...>" span in the buffer.
//
// On success [begin(), end()) covers the tag from its '<' up to and including
// the first '>' that follows it, and true is returned.
// When no further complete tag exists, both iterators are parked at the end
// of the buffer and false is returned.
//
// The search resumes one character after the current '<' (not after the
// current '>'), so a '<' nested inside the current span is reported as a tag
// of its own on the next call.
bool html_stream::advance_to_next_tag()
{
	const std::string::const_iterator last = Buffer.end();
	if ( i_begin == last )
	{
		return false;
	}

	// An empty range means no tag has been returned yet (fresh stream), so
	// the character AT i_begin must be examined too; otherwise a document
	// starting with '<' would lose its first tag. Once a tag is current,
	// step over its '<' so the same tag is not returned twice.
	std::string::const_iterator open = i_begin;
	if ( i_begin != i_end )
	{
		++open;
	}

	while ( open != last && *open != '<' )
	{
		++open;
	}

	if ( open != last )
	{
		// look for matching '>'
		std::string::const_iterator close = open;
		++close;
		while ( close != last && *close != '>' )
		{
			++close;
		}

		if ( close != last )
		{
			i_begin = open;
			i_end = ++close;
			return true;
		}
		// No '>' after this '<' means no later '<' can be closed either,
		// so there is nothing left to find.
	}

	i_begin = last;
	i_end = last;
	return false;
}
 
bool html_stream::is_current_tag_type( const std::string & tag_name ) const
{
	if ( *i_begin != '<' )
	{
		throw std::runtime_error( "no current tag" );
	}
 
	regex pattern( "<(\\s*)" + tag_name + "(.*)>", regex_constants::icase );
	return regex_match( i_begin, i_end, pattern );
}
 
bool html_stream::is_current_tag_self_closed() const
{
	static regex pattern( "<(.*)/(\\s*)>" );
	return regex_match( i_begin, i_end, pattern );
}
 
// Extends the current range so that end() sits just past the next closing
// tag ("</...>", spaces allowed between '<' and '/') found at or after the
// current end().
//
// begin() is left untouched. When no complete closing tag is found, end() is
// moved to the end of the buffer.
void html_stream::set_end_to_include_next_closing_tag()
{
	const std::string::const_iterator last = Buffer.end();

	// Start AT i_end: a closing tag may begin right after the current tag,
	// as in "<a></a>".
	for ( std::string::const_iterator lt = i_end; lt != last; ++lt )
	{
		if ( *lt != '<' )
		{
			continue;
		}

		// validate "</": only spaces may separate '<' from '/'
		std::string::const_iterator probe = lt;
		++probe;
		while ( probe != last && *probe == ' ' )
		{
			++probe;
		}
		if ( probe == last || *probe != '/' )
		{
			// an opening tag or a stray '<': keep looking further on
			continue;
		}

		// look for matching '>'
		while ( probe != last && *probe != '>' )
		{
			++probe;
		}
		if ( probe == last )
		{
			// unterminated closing tag: nothing usable remains
			break;
		}

		// Stop at the first closing tag. Continuing to scan here would walk
		// on to the end of the buffer and could increment past it.
		i_end = ++probe;
		return;
	}

	// failed
	i_end = last;
}

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
#ifndef HTML_PARSER_H
#define HTML_PARSER_H
 
class html_stream;
 
// Drives an html_stream tag by tag and hands each tag to the attribute
// extraction code in the implementation file.
class html_parser
{
public:
	// Consumes the stream: it is advanced until no more tags are found.
	void parse_stream( html_stream & );
};
 
#endif

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
 
#include "html_parser.h"
#include "html_stream.h"
 
#include <tr1/regex>
using namespace std::tr1;
 
#include <iostream>
using namespace std;
 
#include <algorithm>
#include <cctype>
#include <cstring>
#include <set>
#include <stdexcept>
#include <string>
 
// Records a link target and prints it as "URL: <target>" the first time it
// is seen. Any fragment ('#' and everything after it) is removed first, and
// targets that are empty after that are ignored.
void add_href( std::string url )
{
    // Cut the fragment so that "page#a" and "page#b" count as one link.
    const std::string::size_type fragment_start = url.find( '#' );
    if ( fragment_start != std::string::npos )
    {
        url.erase( fragment_start );
    }

    if ( !url.empty() )
    {
        // Every link reported so far, kept for the lifetime of the program.
        static std::set<std::string> seen_links;

        // insert() reports through .second whether the element was new.
        if ( seen_links.insert( url ).second )
        {
            std::cout << "URL: " << url << "\n";
        }
    }
}
 
// Records an image source and prints it as "IMAGE: <source>" the first time
// it is seen. The value is used as given: no fragment removal and no
// empty-string filtering.
void add_image_src( const std::string & url )
{
    // Every image source reported so far, kept for the lifetime of the program.
    static std::set<std::string> seen_images;

    // insert() reports through .second whether the element was new.
    const bool first_sighting = seen_images.insert( url ).second;
    if ( first_sighting )
    {
        std::cout << "IMAGE: " << url << "\n";
    }
}
 
// Lower-cases a single character.
// std::tolower requires a value representable as unsigned char, hence the
// cast. Wrapping it in a named function also gives std::transform one
// unambiguous callable: a bare `tolower` is an overload set once the <locale>
// version is visible.
static char to_lower_char( char c )
{
	return static_cast<char>( std::tolower( static_cast<unsigned char>( c ) ) );
}

// Dispatches one attribute of one tag to the matching collector.
//
// tag_name   - element name, any letter case
// attr_name  - attribute name, any letter case
// attr_value - attribute value, passed on unchanged
//
// <a href> and <link href> go to add_href, <img src> goes to add_image_src;
// every other combination is ignored.
void handle_attribute( std::string tag_name, std::string attr_name, const std::string & attr_value )
{
	// change to lower case
	std::transform( tag_name.begin(), tag_name.end(), tag_name.begin(), to_lower_char );
	std::transform( attr_name.begin(), attr_name.end(), attr_name.begin(), to_lower_char );

	if ( attr_name == "href" && ( tag_name == "a" || tag_name == "link" ) )
	{
		add_href( attr_value );
	}
	else if ( tag_name == "img" && attr_name == "src" )
	{
		add_image_src( attr_value );
	}
}
 
void parse_attributes_of_tag( std::string::const_iterator begin, std::string::const_iterator end )
{
	if ( *begin != '<' )
	{
		throw std::invalid_argument( "Not the begining of an HTML tag" );
	}
 
	//cout << "Parsing {" << string( begin, end ) << "}\n";
 
	static regex tag_with_attr_pattern( "<(\\s*)(\\w+)(.*=.*)>" );
	boost::smatch r;
	if ( regex_search( begin, end, r, tag_with_attr_pattern ) )
	{
		// found a tag with at least one attribute		
		string tag_name = r[2];
		string attributes = r[3];
 
		std::string::const_iterator begin = attributes.begin();
		std::string::const_iterator end = attributes.end();
 
		while ( begin != end )
		{
			// extract 1 attribute
			static regex single_quoted_attr_extraction_pattern( "\\s*(\\w+)\\s*=\\s*'([^']*)'" );
			static regex double_quoted_attr_extraction_pattern( "\\s*(\\w+)\\s*=\\s*\"([^\"]*)\"" );
			static regex unquoted_attr_extraction_pattern( "\\s*(\\w+)\\s*=\\s*(\\w+)" );
			boost::smatch r;
			if ( regex_search( begin, end, r, single_quoted_attr_extraction_pattern ) )
			{
				handle_attribute( tag_name, r[1], r[2] );
			}
			else if ( regex_search( begin, end, r, double_quoted_attr_extraction_pattern ) )
			{
				handle_attribute( tag_name, r[1], r[2] );
			}
			else if ( regex_search( begin, end, r, unquoted_attr_extraction_pattern ) )
			{
				handle_attribute( tag_name, r[1], r[2] );
			}
			else
			{
				break;
			}
			// advance in string
			begin += r.length();
		}
	}
}
 
// Walks the stream one tag at a time and runs attribute extraction on each
// tag range until the stream reports that no tag is left.
void html_parser::parse_stream( html_stream & stream )
{
	for ( ;; )
	{
		if ( !stream.advance_to_next_tag() )
		{
			break;
		}
		parse_attributes_of_tag( stream.begin(), stream.end() );
	}
}

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
  // parse html content
  // The stream is built from this->zData (presumably the downloaded page
  // text - confirm against the enclosing class, which is not shown here).
  html_stream stream( this->zData );
  html_parser parser;
  // Found links and image sources are printed to standard output.
  parser.parse_stream( stream );

Ça utilise les regex de Boost.TR1.

**Zorgloub** · 10/11/2007, 00h16

Merci beaucoup!

Je sais que wget le fait, mais il n'est pas multithread (il existe cependant PUF Parallel URL Fetcher qui le fait, mais ne correspond pas à mes besoins).

J'ai d'autres soucis maintenant pour linker boost::regex, mais je vais expliquer ça dans un autre post.

Merci encore!

extraire des urls d'une page web

C++

Vue hybride

Discussions similaires

Partager

Partager