/*
 * Minimal HTML link extractor.
 *
 * Reads an HTML document from standard input and prints on standard output,
 * one per line, the value of every `href` attribute found in an <a> tag and
 * every `src` attribute found in an <img> tag.  Diagnostics are written to
 * standard error.  The process terminates from inside my_get_char() once the
 * input is exhausted.
 */
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define BUFFER_LENGTH 256
#define ATTR_LENGTH 64
#define TAG_LENGTH 64
#define URL_LENGTH 1024

/* Input buffer state: buff[offset .. end) holds bytes not yet delivered. */
char buff[BUFFER_LENGTH];
int offset = 0;
int end = 0;
/* One-character push-back slot; -1 means "empty". */
int last_char = -1;

/*
 * Return the next input byte as a value in 0..255.
 *
 * A character previously handed to my_put_back() is returned first.
 * Never returns on end of input: the process exits with EXIT_SUCCESS.
 * On a read error a diagnostic is printed and the process exits with
 * EXIT_FAILURE.
 */
int my_get_char(void)
{
    if (last_char > -1) {
        int pending = last_char;
        last_char = -1;
        return pending;
    }

    if (offset >= end) {
        ssize_t got;

        /* A read interrupted by a signal is not an error: retry it. */
        do {
            got = read(STDIN_FILENO, buff, BUFFER_LENGTH);
        } while (got == -1 && errno == EINTR);

        if (got == 0)
            exit(EXIT_SUCCESS);
        if (got == -1) {
            /* Save errno: fprintf may overwrite it before perror runs.
             * The message goes to stderr so it cannot pollute the URL list. */
            int saved_errno = errno;
            fprintf(stderr, "Echec de la lecture sur l'entrée standard %d !\n",
                    saved_errno);
            errno = saved_errno;
            perror("");
            exit(EXIT_FAILURE);
        }
        end = (int)got;
        offset = 0;
    }

    /* Cast so bytes >= 0x80 stay non-negative (plain char may be signed);
     * negative values would break the -1 sentinel and <ctype.h> calls. */
    return (unsigned char)buff[offset++];
}

/* Push one character back; the next my_get_char() call returns it. */
void my_put_back(char c)
{
    last_char = (unsigned char)c;
}

/* Parser state shared by all the token functions below. */
int cur = 0;                        /* current look-ahead character */
int lineno = 1;                     /* current input line, for diagnostics */
char last_attr[ATTR_LENGTH + 1];    /* name of the attribute being parsed */
char last_tag[TAG_LENGTH + 1];      /* name of the tag being parsed */
char url[URL_LENGTH + 1];           /* value of the attribute being parsed */

/* Record that a line break has been consumed. */
void newline(void)
{
    lineno++;
}

/* Advance `cur` past any whitespace, counting line breaks. */
void ignore_ws(void)
{
    while (isspace(cur)) {
        if (cur == '\n')
            newline();
        cur = my_get_char();
    }
}

/*
 * Read characters and compare them with the string `s`.
 * Returns 1 when all of `s` was matched, 0 at the first mismatch (the
 * mismatching character is left in `cur` and is not pushed back).
 */
int expect(const char *s)
{
    for (; *s; s++) {
        cur = my_get_char();
        if (cur != (unsigned char)*s) {
            if (cur == '\n')
                newline();
            return 0;
        }
    }
    return 1;
}

/* Consume input up to and including the next occurrence of `c`. */
void go_char(char c)
{
    while ((cur = my_get_char()) != (unsigned char)c) {
        if (cur == '\n')
            newline();
    }
}

/* Characters accepted inside tag and attribute names (e.g. h1, data-x). */
static int is_name_char(int c)
{
    return isalnum(c) || c == '-' || c == '_' || c == ':';
}

/* Return 1 when `a` and `b` are equal ignoring ASCII case, else 0. */
static int equals_ignore_case(const char *a, const char *b)
{
    while (*a && *b) {
        if (tolower((unsigned char)*a) != tolower((unsigned char)*b))
            return 0;
        a++;
        b++;
    }
    return *a == *b;
}

/*
 * Append `c` to `url`.  Once the buffer is full a diagnostic is printed
 * (only the first time) and further characters are dropped.
 */
static void url_store(int *len, int *truncated, int c)
{
    if (*len < URL_LENGTH) {
        url[(*len)++] = (char)c;
    } else if (!*truncated) {
        url[*len] = 0;
        fprintf(stderr, "URL trop longue à la ligne %d : %s\n", lineno, url);
        *truncated = 1;
    }
}

/* tokens : */
void _document(void);
void _out(void);
void _tag(void);
void _tag_name(void);       /* sets last_tag */
void _tag_content(void);
void _comment_content(void);
int _attr_name(void);       /* sets last_attr */
void _attr_content(void);

/*
 * Entry point of the parser.  Skips to the first '<', skips a leading
 * <!...> declaration if there is one, then parses tags until end of input.
 */
void _document(void)
{
    go_char('<');
    cur = my_get_char();
    if (cur == '!') {
        go_char('>');
    } else {
        my_put_back((char)cur);
        _tag();
    }
    _out();
}

/* Parse every remaining tag; returns only through exit() in my_get_char(). */
void _out(void)
{
    for (;;) {
        go_char('<');
        _tag();
    }
}

/*
 * Parse one tag; the opening '<' has already been consumed.
 * Handles end tags, comments and start tags.  On return `cur` is the '>'
 * that closes the construct.
 */
void _tag(void)
{
    cur = my_get_char();

    /* end tag */
    if (cur == '/') {
        cur = my_get_char();
        _tag_name();
        ignore_ws();
        /* `cur` is already the character after the name: test it directly
         * instead of reading (and losing) one more character. */
        if (cur != '>') {
            fprintf(stderr,
                    "Le tag fermant de %s à la ligne %d contient des caractères "
                    "innattendus !\n", last_tag, lineno);
            go_char('>');
        }
        return;
    }

    /* comment case */
    if (cur == '!') {
        if (!expect("--")) {
            fprintf(stderr,
                    "Le tag commençant par ! à la ligne %d semble ne pas être "
                    "un commentaire, tentative de récupération.\n", lineno);
            /* Not a comment: skip the declaration only.  Scanning for "-->"
             * here would swallow everything up to the next real comment. */
            if (cur != '>')
                go_char('>');
            return;
        }
        _comment_content();
        return;
    }

    /* start tag */
    _tag_name();
    _tag_content();
    if (cur == '/') {
        /* start-end tag such as <br/> */
        go_char('>');
    } else if (cur != '>') {
        fprintf(stderr,
                "Le tag ouvrant de %s à la ligne %d contient des caractères "
                "innattendus !\n", last_tag, lineno);
        go_char('>');
    }
}

/*
 * Read a tag name starting at `cur` into last_tag.
 * A name longer than TAG_LENGTH is reported once and truncated; the extra
 * characters are still consumed so the parser stays in sync.
 */
void _tag_name(void)
{
    int len = 0;
    int truncated = 0;

    while (is_name_char(cur)) {
        if (len < TAG_LENGTH) {
            last_tag[len++] = (char)cur;
        } else if (!truncated) {
            last_tag[len] = 0;
            fprintf(stderr, "Tag trop long à la ligne %d : %s\n", lineno, last_tag);
            truncated = 1;
        }
        cur = my_get_char();
    }
    last_tag[len] = 0;
}

/*
 * Parse the attribute list of a start tag.
 * Stops at the first character that cannot start an attribute name, leaving
 * it in `cur`.  If an attribute name is too long, the rest of the tag is
 * skipped.
 */
void _tag_content(void)
{
    for (;;) {
        ignore_ws();

        int name_len = _attr_name();
        if (name_len == 0)
            return;
        if (name_len == -1) {
            go_char('>');
            return;
        }

        ignore_ws();
        if (cur == '=') {
            cur = my_get_char();
            ignore_ws();
            _attr_content();
        }
    }
}

/*
 * Skip the body of a comment; the opening "<!--" has been consumed.
 * Returns with `cur` on the '>' of the terminating "-->".  Counting the run
 * of dashes makes terminators such as "--->" work too.
 */
void _comment_content(void)
{
    int dashes = 0;

    for (;;) {
        cur = my_get_char();
        if (cur == '-') {
            dashes++;
        } else if (cur == '>' && dashes >= 2) {
            return;
        } else {
            if (cur == '\n')
                newline();
            dashes = 0;
        }
    }
}

/*
 * Read an attribute name starting at `cur` into last_attr.
 * Returns the length of the name (0 when `cur` does not start a name), or
 * -1 when the name exceeds ATTR_LENGTH.
 */
int _attr_name(void)
{
    int len = 0;

    while (is_name_char(cur)) {
        if (len >= ATTR_LENGTH) {
            last_attr[len] = 0;
            fprintf(stderr, "Attr trop long à la ligne %d : %s\n", lineno, last_attr);
            return -1;
        }
        last_attr[len++] = (char)cur;
        cur = my_get_char();
    }
    last_attr[len] = 0;
    return len;
}

/*
 * Read an attribute value starting at `cur` into `url`, then print it when
 * the current tag/attribute pair is a/href or img/src (case-insensitive).
 *
 * The value is either quoted with " or ', or unquoted, in which case it ends
 * at whitespace or '>'.  A backslash keeps the following character in the
 * value even if it is a delimiter.  A value longer than URL_LENGTH is
 * reported, consumed to its end, and not printed.
 */
void _attr_content(void)
{
    int len = 0;
    int truncated = 0;
    int quote = (cur == '"' || cur == '\'') ? cur : 0;

    if (quote)
        cur = my_get_char();

    for (;;) {
        int at_end = quote ? (cur == quote) : (isspace(cur) || cur == '>');
        if (at_end)
            break;

        url_store(&len, &truncated, cur);
        if (cur == '\\') {
            cur = my_get_char();
            url_store(&len, &truncated, cur);
        }
        /* Only reachable inside a quoted value or right after a backslash;
         * an unquoted value stops before a line break. */
        if (cur == '\n')
            newline();
        cur = my_get_char();
    }

    /* Step over the closing quote so the next attribute can be parsed. */
    if (quote)
        cur = my_get_char();

    url[len] = 0;
    if (truncated)
        return;

    int is_link = equals_ignore_case(last_tag, "a")
               && equals_ignore_case(last_attr, "href");
    int is_image = equals_ignore_case(last_tag, "img")
                && equals_ignore_case(last_attr, "src");
    if (is_link || is_image)
        printf("%s\n", url);
}

int main(void)
{
    _document();
    return EXIT_SUCCESS;
}