CUrl

 1 #ifndef _URL_H_030728_
 2 #define _URL_H_030728_
 3 
 4 #include <string>
 5 
 // Size limits, in bytes. HOST_LEN is the capacity of the host buffer used
// while parsing and the upper bound applied by the host/URL predicates.
const unsigned int HOST_LEN = 256u;
const unsigned int URL_LEN  = 256u;
 8 
 9 using namespace std;
10 
11 
// URL schemes known to CUrl. CUrl::ParseScheme() converts an index into its
// scheme table directly into this enum, so the numeric values matter.
enum url_scheme
{
    SCHEME_HTTP    = 0,
    SCHEME_FTP     = 1,
    SCHEME_INVALID = 2
};
17 
// Port numbers assumed for each scheme when a URL does not name one.
const int DEFAULT_FTP_PORT  = 21;
const int DEFAULT_HTTP_PORT = 80;
20 
21 class CUrl
22 {
23 public:
24     string m_sUrl;            // 原始的url地址
25     enum url_scheme m_eScheme;    // URL 类型
26 
27     string    m_sHost;        // 提取出来的主机地址
28     int    m_nPort;        // 主机端口号
29     string    m_sPath;        //路径
30 
31 
32 public:
33     CUrl();
34     ~CUrl();
35 
36     //bool ParseUrl(string strUrl);
37 
38     // break  an URL into scheme, host, port and request.
39     // result as member variants
40     bool ParseUrlEx(string strUrl);
41 
42     // break an URL into scheme, host, port and request.
43     // result url as argvs
44     void ParseUrlEx(const char *url, char *protocol, int lprotocol,
45             char *host, int lhost,
46             char *request, int lrequest, int *port);
47 
48     // get the ip address by host name
49     char *GetIpByHost(const char *host);
50 
51     bool IsValidHost(const char *ip);
52     bool IsForeignHost(string host);
53     bool IsImageUrl(string url);
54     bool IsValidIp(const char *ip);
55     bool IsVisitedUrl(const char *url);
56     bool IsUnReachedUrl(const char *url);
57     bool IsValidHostChar(char ch);
58 
59 //private:
60     void ParseScheme (const char *url);
61 };
62 
// pthread_mutex_t is declared by <pthread.h>; include it here so this header
// is self-contained instead of relying on the including file's include order.
#include <pthread.h>

// Mutex defined in another translation unit. The lock/unlock calls that name
// it in the implementation are currently commented out.
extern pthread_mutex_t mutexMemory;
64 
65 #endif /* _URL_H_030728_ */
  1 /* URL handling
  2  */
  3 
  4 #include <iostream>
  5 #include <string.h>
  6 #include <sys/socket.h>
  7 #include <netdb.h>
  8 #include <map>
  9 #include "Url.h"
 10 #include <stdlib.h>
 11 #include <arpa/inet.h>
 12 
 13 //#include "Tse.h"
 14 //#include "Url.h"
 15 //#include "Http.h"
 16 //#include "Md5.h"
 17 //#include "StrFun.h"
 18 
 19 
 20 
 21 //
 /* Is X exactly the string "."?  (The previous expansion referred to an
 * undefined name instead of the macro argument, so any use failed to compile.) */
#define DOTP(x) ((*(x) == '.') && (!*((x) + 1)))
/* Is X exactly the string ".."?  Arguments are parenthesised so that
 * expressions such as DDOTP(p + n) expand correctly. */
#define DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') && (!*((x) + 2)))
 26 
 // Cache of resolved hosts: host name -> IP address in text form.
// CUrl::GetIpByHost() consults it before doing a DNS lookup and stores each
// fresh result in it. The mutex that would guard it is commented out below.
std::map<std::string, std::string> mapCacheHostLookup;
//extern vector<string> vsUnreachHost;
//pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;
//extern set<string> setVisitedUrlMD5;
//extern map<unsigned long,unsigned long> mapIpBlock;

// Element type of the cache, used when inserting (host, ip) pairs.
typedef std::map<std::string, std::string>::value_type valTypeCHL;
 33 
 // One row of the scheme table used by CUrl::ParseScheme().
struct scheme_data {
    const char *leading_string;  // URL prefix identifying the scheme; NULL ends the table
    int         default_port;    // port associated with the scheme
    int         enabled;         // non-zero when URLs of this scheme are accepted
};
 40 
 41 /* 支持的网页类型 */
 42 static struct scheme_data supported_schemes[] =
 43 {
 44     { "http://",  DEFAULT_HTTP_PORT,  1 },
 45     { "ftp://",   DEFAULT_FTP_PORT,   1 },
 46 
 47     /* 不合法的网页 */
 48     { NULL,       -1,                 0 }
 49 };
 50 
 51 /* 分析类型,若是合法则返回正确的,否则是不合法的  */
 52 void CUrl::ParseScheme (const char *url)
 53 {
 54     int i;
 55 
 56     for (i = 0; supported_schemes[i].leading_string; i++)
 57         if (0 == strncasecmp (url, supported_schemes[i].leading_string,strlen (supported_schemes[i].leading_string)))//比较url的前几个字母
 58         {
 59             if (supported_schemes[i].enabled)
 60             {
 61                 this->m_eScheme = (enum url_scheme) i;
 62                 return;
 63             }
 64             else
 65             {
 66                 this->m_eScheme = SCHEME_INVALID;
 67                 return;
 68             }
 69         }
 70 
 71     this->m_eScheme = SCHEME_INVALID;
 72     return;
 73 }
 74 
 75 /************************************************************************
 76  *  Function name: ParseUrlEx
 77  *  Input argv:
 78  *      -- strUrl: url
 79  *  Output argv:
 80  *      --
 81  *  Return:
 82        true: success
 83        false: fail
 84  *  Fucntion Description: break an URL into scheme, host, port and request.
 85  *              result as member variants
 86  *  Be careful:    release the memory by the client
 87 ************************************************************************/
 88 
 89 bool CUrl::ParseUrlEx(string strUrl)
 90 {
 91     char protocol[10];
 92     char host[HOST_LEN];
 93     char request[256];
 94     int port = -1;
 95 
 96     memset( protocol, 0, sizeof(protocol) );
 97     memset( host, 0, sizeof(host) );
 98     memset( request, 0, sizeof(request) );
 99 
100     this->ParseScheme(strUrl.c_str());
101     if( this->m_eScheme != SCHEME_HTTP )
102     {
103         return false;
104     }
105 
106     ParseUrlEx(strUrl.c_str(),
107             protocol, sizeof(protocol),
108             host, sizeof(host),
109             request, sizeof(request),
110             &port);
111 
112     m_sUrl  = strUrl;
113     m_sHost = host;
114     m_sPath = request;
115 
116     if( port > 0 ){
117         m_nPort = port;
118     }
119 
120     return true;
121 }
122 
123 /************************************************************************
124  *  Function name: ParseUrlEx
125  *  Input argv:
126  *      -- url: host name
127  *      -- protocol: result protocol
128  *      -- lprotocol: protocol length
129  *      -- host: result host
130  *      -- lhost: host length
131  *      -- request: result request
132  *      -- lrequest: request length
133  *  Output argv:
134  *      --
135  *  Return:
136        true: success
137        false: fail
138  *  Fucntion Description: break an URL into scheme, host, port and request.
139  *              result as argvs
140  *  Be careful:
141 ************************************************************************/
142 void CUrl::ParseUrlEx(const char *url,
143         char *protocol, int lprotocol,
144         char *host, int lhost,
145         char *request, int lrequest,
146         int *port)
147 {
148     char *work,*ptr,*ptr2;
149 
150     *protocol = *host = *request = 0;
151     *port = 80;
152 
153     int len = strlen(url);
154     //pthread_mutex_lock(&mutexMemory);
155     work = new char[len + 1];
156     //pthread_mutex_unlock(&mutexMemory);
157     memset(work, 0, len+1);
158     strncpy(work, url, len);
159     //把url的内容复制到work中
160 
161     // find protocol if any
162 //在work中查找:(默认的是http)
163     ptr = strchr(work, ':');
164     if( ptr != NULL )
165     {
166         *(ptr++) = 0;
167         strncpy( protocol, work, lprotocol );
168     } else {
169         strncpy( protocol, "HTTP", lprotocol );
170         ptr = work;
171     }
172 
173     // skip past opening /'s
174 //调过 // 
175     if( (*ptr=='/') && (*(ptr+1)=='/') )
176         ptr+=2;
177 
178     // 查找主机地址
179     ptr2 = ptr;
180     while( IsValidHostChar(*ptr2) && *ptr2 )
181         ptr2++;
182     *ptr2 = 0;//保证合法的字符串
183     strncpy( host, ptr, lhost );
184 
185     //查找请求的网页
186     int offset = ptr2 - work;
187     const char *pStr = url + offset;
188     strncpy( request, pStr, lrequest );
189 
190     //找到主机的端口
191     ptr = strchr( host, ':' );
192     if( ptr != NULL ){
193         *ptr = 0;
194         *port = atoi(ptr+1);
195     }
196 
197     //pthread_mutex_lock(&mutexMemory);
198     delete [] work;
199     //pthread_mutex_unlock(&mutexMemory);
200     work = NULL;
201 }
202 
203 
204 
205 
206 
207 
208 /* scheme://user:pass@host[:port]... 
209  *                    ^              
210  * We attempt to break down the URL into the components path,
211  * params, query, and fragment.  They are ordered like this:
212  * scheme://host[:port][/path][;params][?query][#fragment] 
213  */
214 
215 /*
216 bool CUrl::ParseUrl(string strUrl)
217 {
218     string::size_type idx;
219 
220     this->ParseScheme(strUrl.c_str());    
221     if( this->m_eScheme != SCHEME_HTTP )
222         return false;
223 
224     // get host name
225     this->m_sHost = strUrl.substr(7);
226     idx = m_sHost.find('/');
227     if(idx != string::npos){
228         m_sHost = m_sHost.substr(0,idx);
229     }
230 
231     this->m_sUrl = strUrl;
232 
233     return true;
234 }
235 */
236 //CUrl的构造函数
237 CUrl::CUrl()
238 {
239     this->m_sUrl = ""; 
240     this->m_eScheme= SCHEME_INVALID;
241         
242     this->m_sHost = "";  
243     this->m_nPort = DEFAULT_HTTP_PORT; //默认端口
244         
245     this->m_sPath = "";
246     /*
247     this->m_sParams = "";
248     this->m_sQuery = "";
249     this->m_sFragment = "";
250 
251     this->m_sDir = "";
252     this->m_sFile = "";
253         
254         this->m_sUser = "";
255     this->m_sPasswd = "";
256     */
257 
258 }
259 
260 CUrl::~CUrl()
261 {
262 
263 }
264 
265 
266 /****************************************************************************
267  *  Function name: GetIpByHost
268  *  Input argv:
269  *      -- host: host name
270  *  Output argv:
271  *      --
272  *  Return:
273        ip: sucess
274        NULL: fail
275  *  Function Description: get the ip address by host name
276  *  Be careful: release the memory by the client
277 ****************************************************************************/
278 //通过主机地址获得IP地址
279 char * CUrl::GetIpByHost(const char *host)
280 {
281         
282     if( !host ){    // null pointer
283         return NULL;
284         cout<<"f1";
285     }
286 
287     if( !IsValidHost(host) ){    // invalid host
288         return NULL;
289         cout<<"f2";
290     }
291     unsigned long inaddr = 0;
292     char *result = NULL;
293     int len = 0;
294 
295 
296     inaddr = (unsigned long)inet_addr( host );//将字符串IP转化为32二进制的网络字节序
297     //if ( (int)inaddr != -1){ 
298     if ( inaddr != INADDR_NONE)
299     { // 主机地址就是用IP地址表示的
300         len = strlen(host);
301         //pthread_mutex_lock(&mutexMemory);
302         result = new char[len+1];
303         cout<<result;
304         //pthread_mutex_unlock(&mutexMemory);
305         memset(result, 0, len+1);
306         memcpy(result, host, len);
307 
308         return result;
309     } 
310     else 
311     {
312         //firt find from cache
313         
314         map<string,string>::iterator it  = mapCacheHostLookup.find(host);
315         //可以在DNS缓存中找到
316         if( it != mapCacheHostLookup.end() )
317         {    //如果在cache中找到IP地址
318             const char * strHostIp;
319 
320             strHostIp = (*it).second.c_str();
321 
322             inaddr = (unsigned long)inet_addr( strHostIp );
323             //if ( (int)inaddr != -1){ 
324             if ( inaddr != INADDR_NONE )
325             { 
326                 len = strlen(strHostIp);
327                 //pthread_mutex_lock(&mutexMemory);
328                 result = new char[len+1];
329                 //pthread_mutex_unlock(&mutexMemory);
330                 memset( result, 0, len+1 );
331                 memcpy( result, strHostIp, len );
332 
333                 //cout << ":)" ;
334                 
335                 return result;
336             }
337         }
338     }
339 
340     //通过上面的方法我们都没有查找,这个时候我们只能通过DNS server查找了,这种带宽的消耗是必要的!
341     struct hostent *hp;    /* Host entity */
342     hp = gethostbyname(host);
343     //通过主机号或者说是域名得到hostent结构,这个结构包含主机号或者说域名的很多信息,例如我们要找的IP字符串就在其中
344     if(hp == NULL) { 
345         //cout << "gethostbyname() error in GetIpByHost: " << host << endl;
346         return NULL;
347     }
348 
349     // cache host lookup
350     struct  in_addr in;
351 
352     bcopy(*(hp->h_addr_list), (caddr_t)&in, hp->h_length);
353     /*功能:将字符串src的前n个字节复制到dest中
354      说明:bcopy不检查字符串中的空字节NULL,函数没有返回值。*/
355         
356     char    abuf[INET_ADDRSTRLEN];
357     if( inet_ntop(AF_INET, (void *)&in,abuf, sizeof(abuf)) == NULL )
358     {
359         cout << "inet_ntop() return error in GetIpByHost" << endl;
360         return NULL;
361 
362     } 
363     else
364     {
365 
366         //if( mapCacheHostLookup.count(host) == 0){
367         if( mapCacheHostLookup.find(host) == mapCacheHostLookup.end() ){
368         
369             //cout << endl << host << " and " << abuf << endl;
370             mapCacheHostLookup.insert( valTypeCHL ( host, abuf));
371             //更新DNS缓存
372             //cout<<((*mapCacheHostLookup.find("home.ustc.edu.cn")).second.c_str());
373 
374         }
375 
376     }
377 
378     // return result
379     len = strlen(abuf);
380     //pthread_mutex_lock(&mutexMemory);
381     result = new char[len + 1];
382     //pthread_mutex_unlock(&mutexMemory);
383     memset( result, 0, len+1 );
384     memcpy( result, abuf, len );
385 
386     return result;
387 }
388 
389 /**********************************************************************************
390  *  Function name: IsValidHostChar
391  *  Input argv:
392  *      -- ch: the character for testing
393  *  Output argv:
394  *      -- 
395  *  Return:
396        true: is valid
397        false: is invalid
398  *  Function Description: test the specified character valid
399  *              for a host name, i.e. A-Z or 0-9 or -.:
400 **********************************************************************************/
401 bool CUrl::IsValidHostChar(char ch)
402 {
403     return( isalpha(ch) || isdigit(ch)
404         || ch=='-' || ch=='.' || ch==':' || ch=='_');
405 }
406 
407 /**********************************************************************************
408  *  Function name: IsValidHost
409  *  Input argv:
410  *      -- ch: the character for testing
411  *  Output argv:
412  *      -- 
413  *  Return:
414        true: is valid
415        false: is invalid
416  *  Function Description: test the specified character valid
417  *              for a host name, i.e. A-Z or 0-9 or -.:
418  *  Be careful:
419 **********************************************************************************/
420 bool CUrl::IsValidHost(const char *host)
421 {
422     if( !host ){//空的主机号,我们认为是无效的主机号
423         return false;
424     }
425 
426     if( strlen(host) < 6 ){ //主机号长度小于6,我们认为ieshi无效的主机号
427         return false;
428     }
429 
430     char ch;
431     for(unsigned int i=0; i<strlen(host); i++){
432         ch = *(host++);
433         if( !IsValidHostChar(ch) ){
434             return false;
435         }
436     }
437 
438     return true;
439 }
440 
441 /**********************************************************************************
442  *  Function name: IsVisitedUrl
443  *  Input argv:
444  *      -- url: url
445  *  Output argv:
446  *      -- 
447  *  Return:
448        true: is visited
449        false: not visited
450  *  Function Description: test the url visited by the MD5
451  *  Be careful:
452 **********************************************************************************/
453 bool CUrl::IsVisitedUrl(const char *url)//判断该URL是否访问过
454 {
455     if( !url ){
456         return true; // if be null, we think it have been visited
457     }
458 
459     CMD5 iMD5;
460     iMD5.GenerateMD5( (unsigned char*)url, strlen(url) );
461     string strDigest = iMD5.ToString();
462 
463     if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) {
464         return true;
465     } else {
466         return false;
467     }
468 
469 }
470 
471 
472 /**********************************************************************************
473  *  Function name: IsValidIp
474  *  Input argv:
475  *      -- ip: ip
476  *  Output argv:
477  *      -- 
478  *  Return:
479        true: inside the ip block
480        false: outside the ip block
481  *  Function Description: decide teh ip whether or not inside the ip block
482  *  Be careful:
483 **********************************************************************************/
484 bool CUrl::IsValidIp(const char *ip)
485 {
486     if( ip == NULL )
487     {
488         return false;
489     }
490 
491     unsigned long inaddr = (unsigned long)inet_addr(ip);
492     if( inaddr == INADDR_NONE ){//显然该IP参数不是正确的字符串IP
493         return false;
494     }
495 
496     if (mapIpBlock.size() > 0) { //判断是否要过滤掉
497         map<unsigned long, unsigned long>::iterator pos;
498         for (pos = mapIpBlock.begin(); pos != mapIpBlock.end(); ++pos) {
499             unsigned long ret;
500 
501             ret = inaddr & ~((*pos).second);
502             if (ret == (*pos).first) { // inside
503                 return true;
504             }
505         }
506 
507         // outside
508         return false;
509     }
510 
511 
512     // if block range is not given, we think it inside also
513     return true;
514 }
515 /*
516  * If it is, return true; otherwise false
517  * not very precise
518  */
519 bool CUrl::IsForeignHost(string host)
520 {
521     if( host.empty() ) return true;
522     if( host.size() > HOST_LEN ) return true;
523 
524     unsigned long inaddr = 0;
525 
526     inaddr = (unsigned long)inet_addr( host.c_str() );
527     if ( inaddr != INADDR_NONE){ // host is just ip
528         return false;
529     }
530 
531     string::size_type idx = host.rfind('.');
532     string tmp;
533     if( idx != string::npos ){
534         tmp = host.substr(idx+1);
535     }
536 
537     CStrFun::Str2Lower( tmp, tmp.size() );
538     const char *home_host[] ={
539         "cn","com","net","org","info",
540         "biz","tv","cc", "hk", "tw"
541     };
542 
543     int home_host_num = 10;
544 
545     for(int i=0; i<home_host_num; i++){
546         if( tmp == home_host[i] )
547             return false;
548     }
549 
550     return true;
551 }
552     
553     
554 bool CUrl::IsImageUrl(string url)
555 {
556     if( url.empty() ) return false;
557     if( url.size() > HOST_LEN ) return false;
558 
559     string::size_type idx = url.rfind('.');
560     string tmp;
561     if( idx != string::npos ){
562         tmp = url.substr(idx+1);
563     }
564 
565     CStrFun::Str2Lower( tmp, tmp.size() );
566     const char *image_type[] ={
567         "gif","jpg","jpeg","png","bmp",
568         "tif","psd"
569     };
570 
571     int image_type_num = 7;
572 
573     for (int i=0; i<image_type_num; i++)
574     {
575         if( tmp == image_type[i] )
576             return true;
577     }
578 
579     return false;
580 }
原文地址:https://www.cnblogs.com/kakamilan/p/2578412.html