当前位置:首页 > 读后感 > 用 C 语言编写一个网络蜘蛛来搜索网上出现的电子邮件地址_编写C
 

用 C 语言编写一个网络蜘蛛来搜索网上出现的电子邮件地址_编写C

发布时间:2019-07-25 09:24:33 影响了:

用 C 语言编写一个网络蜘蛛来搜索网上出现的电子邮件地址

可能大家经常要去互联网上搜索特定的内容,比如收集大量邮件地址,如果用 google 之类的搜索引擎是没法实现这种特定功能的,所以用 C 语言来写一个吧。它的功能就是不断去取得网络上的页面,然后分析出网页上出现的邮件地址保存下来。象个蜘蛛一样,从网络上一个网页爬向另一个网页,不停止地搜索邮件地址。

当然这只是一个原理展示程序,并没有进行优化。

这个程序的 main 函数流程图如下:

.png

即:分析程序运行时的参数,把各网页地址作为根节点加入到链表,然后从链表头开始处理各节点

对整个链表的处理是先处理兄弟节点,流程图如下:

.png

然后再处理各节点的子节点,流程图如下:

.png

当然,这里采用了递归调用方法,处理子节点的数据时和处理整个链表一样循环处理就是了。

/************关于本文档********************************************

*filename: 用 C 语言编写一个网络蜘蛛来搜索网上出现的电子邮件地址

*purpose: 一个邮址搜索程序的雏形

*wrote by: zhoulifa(zhoulifa@163.com) 周立发()

Linux爱好者 Linux知识传播者 SOHO族 开发者 最擅长C语言

*date time:2006-08-31 21:00:00

*Note: 任何人可以任意复制代码并运用这些文档,当然包括你的商业用途

* 但请遵循GPL

*Hope:希望越来越多的人贡献自己的力量,为科学技术发展出力

*********************************************************************/

程序在运行的过程中要建立一个树形链表结构,结构图如下:

/inc/mails4.png

程序启动时分析所带参数,把各参数加入到根网页节点,如果有多个参数则这个根网页有兄弟节点。

然后从根节点开始处理这一级上各节点,把各节点网页上出现的网页链接加到该节点的子节点上,处理完当前这一级后处理子节点这一级。

源代码如下:

#include

#include

#include

#include

#include

#include

#include

#include

#include

#include

#include

#define USERAGENT "Wget/1.10.2"

#define ACCEPT "*/*"

#define ACCEPTLANGUAGE "zh-cn,zh;q=0.5"

#define ACCEPTENCODING "gzip,deflate"

#define ACCEPTCHARSET "gb2312,utf-8;q=0.7,*;q=0.7"

#define KEEPALIVE "300"

#define CONNECTION "keep-alive"

#define CONTENTTYPE "application/x-www-form-urlencoded"

#define MAXFILENAME 14

#define DEBUG 1

typedef struct webnode {

char * host; /* 网页所在的主机 */

int port; /* 网络服务器所使用的端口 */

char * dir; /* 网页所在的目录 */

char * page; /* 网页文件名 */

char * file; /* 本地保存的文件名 */

char IsHandled; /* 是否处理过 */

struct webnode * pother; /* 兄弟节点链表指针 */

struct webnode * child; /* 子节点链表指针 */

} WEBNODE;

struct sockaddr_in server_addr;

int sockfd = 0, dsend = 0, totalsend = 0, nbytes = 0, reqn = 0, i = 0, j = 0, ret = 0;

struct hostent *host;

char rquest[409600] = "", buffer[1024] = "", httpheader[1024] = "";

int FileNumber = 0;

char e[2] = "@/";

WEBNODE * NodeHeader, * NodeTail, * NodeCurr;

char * mapped_mem;

int GetHost(char * , char ** , char ** , int * , char ** ); /**/

void AnalyzePage(WEBNODE *); /**/

void AddInitNode(char *, char *, int, char * ); /**/

void HandleInitNode(WEBNODE *); /**/

void DisplayNode(WEBNODE *); /**/

void HandOneNode(WEBNODE *); /**/

void DoneWithList(int); /**/

void DoOnce(); /**/

void ConnectWeb(void); /**/

void SendRequest(void); /**/

void ReceiveResponse(void); /**/

void GetEmail(char * ); /**/

void GetLink(char * ); /**/

void GetBeforePos(char * , char ** ); /**/

void GetAfterPos(char * , char ** ); /**/

void AddChildNode(WEBNODE * , char * ); /**/

void GetAfterPosWithSlash(char * , char ** ); /**/

void GetMemory(char ** , int ); /**/

int IsExistWeb(WEBNODE * , char * , char * , int , char * ); /**/

void Rstrchr(char * , int , char ** ); /**/

int GetLocalAgent(char * UserAgent, char * Accept, char * AcceptLanguage, char * AcceptEncoding, char * AcceptCharset, char * KeepAlive, char * Connection, char * ContentType); /**/

/**************************************************************

功能:设置 HTTP 协议头内容的一些固定值

***************************************************************/

int GetLocalAgent(char * UserAgent, char * Accept, char * AcceptLanguage, char * AcceptEncoding, char * AcceptCharset, char * KeepAlive, char * Connection, char * ContentType)

{

memcpy(UserAgent, USERAGENT, strlen(USERAGENT));

memcpy(Accept, ACCEPT, strlen(ACCEPT));

memcpy(AcceptLanguage, ACCEPTLANGUAGE, strlen(ACCEPTLANGUAGE));

memcpy(AcceptEncoding, ACCEPTENCODING, strlen(ACCEPTENCODING));

memcpy(AcceptCharset, ACCEPTCHARSET, strlen(ACCEPTCHARSET));

memcpy(KeepAlive, KEEPALIVE, strlen(KEEPALIVE));

memcpy(Connection, CONNECTION, strlen(CONNECTION));

memcpy(ContentType, CONTENTTYPE, strlen(CONTENTTYPE));

return 0;

}

/**************************************************************

功能:在字符串 s 里搜索 x 字符,并设置指针 d 指向该位置

***************************************************************/

void Rstrchr(char * s, int x, char ** d)

{

int len = strlen(s) - 1;

while(len >= 0) {

if(x == s[len]) {(*d) = s + len; return;}

len--;

}

(*d) = 0;

}

/**************************************************************

功能:连接一个网站服务器

***************************************************************/

void ConnectWeb(void) { /* connect to web server */

/* create a socket descriptor */

if((sockfd=socket(PF_INET,SOCK_STREAM,0))==-1)

{

fprintf(stderr,"\tSocket Error:%s\a\n",strerror(errno));

exit(1);

}

/* bind address */

bzero(&server_addr, sizeof(server_addr));

server_addr.sin_family = AF_INET;

server_addr.sin_port = htons(NodeCurr->port);

server_addr.sin_addr = *((struct in_addr *)host->h_addr);

/* connect to the server */

if(connect(sockfd, (struct sockaddr *)(&server_addr), sizeof(struct sockaddr)) == -1)

{

fprintf(stderr, "\tConnect Error:%s\a\n", strerror(errno));

exit(1);

}

}

/**************************************************************

功能:向网站发送 HTTP 请求

***************************************************************/

void SendRequest(void) { /* send my http-request to web server */

dsend = 0;totalsend = 0;

nbytes=strlen(request);

while(totalsend

dsend = write(sockfd, request + totalsend, nbytes - totalsend);

if(dsend==-1) {fprintf(stderr, "\tsend error!%s\n", strerror(errno));exit(0);}

totalsend+=dsend;

fprintf(stdout, "\n\tRequest.%d %d bytes send OK!\n", reqn, totalsend);

}

}

/**************************************************************

功能:接收网站的 HTTP 返回

***************************************************************/

void ReceiveResponse(void) { /* get response from web server */

fd_set writefds;

struct timeval tival;

int retry = 0;

FILE * localfp = NULL;

i=0; j = 0;

__ReCeive:

FD_ZERO(&writefds);

tival.tv_sec = 10;

tival.tv_usec = 0;

if(sockfd > 0) FD_SET(sockfd, &writefds);

else {fprintf(stderr, "\n\tError, socket is negative!\n"); exit(0);}

ret = select(sockfd + 1, &writefds, NULL, NULL, &tival);

if(ret ==0 ) {

if(retry++

}

if(ret

if(FD_ISSET(sockfd, &writefds)) {

memset(buffer, 0, 1024);

memset(httpheader, 0, 1024);

if((localfp = fopen(NodeCurr->file, "w")) == NULL) {if(DEBUG) fprintf(stderr, "create file ‘%s‘ error\n", NodeCurr->file); return;}

/* receive data from web server */

while((nbytes=read(sockfd,buffer,1))==1)

{

if(i

if(buffer[0] == ‘\r‘ || buffer[0] == ‘\n‘) i++;

else i = 0;

memcpy(httpheader + j, buffer, 1); j++;

}

else { /* 获取 HTTP 消息体 */

fprintf(localfp, "%c", buffer[0]); /* print content on the screen */

//fprintf(stdout, "%c", buffer[0]); /* print content on the screen */

i++;

}

}

fclose(localfp);

}

}

/**************************************************************

功能:执行一次 HTTP 请求

***************************************************************/

void DoOnce() { /* send and receive */

ConnectWeb(); /* connect to the web server */

/* send a request */

SendRequest();

/* receive a response message from web server */

ReceiveResponse();

close(sockfd); /* because HTTP protocol do something one connection, so I can close it after receiving */

}

/**************************************************************

功能:执行 HTTP 请求

***************************************************************/

void DoneWithList(int flag) {

if(flag) fprintf(stdout, "\tRequest.%d is:\n%s", ++reqn, request);

DoOnce();

if(flag) fprintf(stdout, "\n\tThe following is the response header:\n%s", httpheader);

}

/**************************************************************

功能:从字符串 src 中分析出网站地址和端口,并得到文件和目录

***************************************************************/

int GetHost(char * src, char ** web, char ** file, int * port, char ** dir) {

char * pA, * pB, * pC;

int len;

*port = 0;

if(!(*src)) return -1;

pA = src;

if(!strncmp(pA, "https://", strlen("https://"))) pA = src+strlen("https://");

/* else if(!strncmp(pA, "https://", strlen("https://"))) pA = src+strlen("https://"); */

else return 1;

pB = strchr(pA, ‘/‘);

if(pB) {

len = strlen(pA) - strlen(pB);

GetMemory(web, len);

memcpy((*web), pA, len);

if(*(pB+1)) {

Rstrchr(pB + 1, ‘/‘, &pC);

if(pC) len = strlen(pB + 1) - strlen(pC);

else len = 0;

if(len > 0) {

GetMemory(dir, len);

memcpy((*dir), pB + 1, len);

if(pC + 1) {

len = strlen(pC + 1);

GetMemory(file, len);

memcpy((*file), pC + 1, len);

}

else {

len = 1;

GetMemory(file, len);

memcpy((*fie), e, len);

}

}

else {

len = 1;

GetMemory(dir, len);

memcpy((*dir), e + 1, len);

len = strlen(pB + 1);

GetMemory(file, len);

memcpy((*file), pB + 1, len);

}

}

else {

len = 1;

GetMemory(dir, len);

memcpy((*dir), e + 1, len);

len = 1;

GetMemory(file, len);

memcpy((*file), e, len);

}

}

else {

len = strlen(pA);

GetMemory(web, len);

memcpy((*web), pA, strlen(pA));

len = 1;

GetMemory(dir, len);

memcpy((*dir), e + 1, len);

len = 1;

GetMemory(file, len);

memcpy((*file), e, len);

}

pA = strchr((*web), ‘:‘);

if(pA) *port = atoi(pA + 1);

else *port = 80;

return 0;

}

/*********************************************************************

*filename: mailaddrsearch.c

*purpose: 用 C 语言编写一个网络蜘蛛来搜索网上出现的电子邮件地址

*tidied by: zhoulifa(zhoulifa@163.com) 周立发()

Linux爱好者 Linux知识传播者 SOHO族 开发者 最擅长C语言

*date time:2006-08-31 21:00:00

*Note: 任何人可以任意复制代码并运用这些文档,当然包括你的商业用途

* 但请遵循GPL

*Thanks to: 广东省 Linux 公共服务技术支持中心

*********************************************************************/

int main(int argc, char ** argv)

{

int WebPort;

char * WebHost = 0, * PageAddress = 0, * WebDir = 0;

if(argc

NodeHeader = NodeTail = NodeCurr = 0;

//setlocale(LC_ALL, "zh_CN.gb2312");

for(i = 1; i

ret = GetHost(argv[i], &WebHost, &PageAddress, &WebPort, &WebDir); /* Get web page info */

if(ret) {if(DEBUG) fprintf(stdout, "GetHost error from ‘%s‘\n", argv[i]); exit(0);}

AddInitNode(WebHost, PageAddress, WebPort, WebDir); /* add this page to chain */

}

free(WebHost); free(PageAddress);free(WebDir);

if(DEBUG) {

fprintf(stdout, "\nDisplay.%5d:", FileNumber);

DisplayNode(NodeHeader); /* display every node */

}

HandleInitNode(NodeHeader); /* handle every page */

return 0;

}

/**************************************************************

功能:分析网页

***************************************************************/

void AnalyzePage(WEBNODE * node)

{

int fd;

int flength = 0;

fd = open(node->file, O_RDONLY);

if(fd == -1) goto __AnalyzeDone;

flength = lseek(fd, 1, SEEK_END);

write(fd, "\0", 1);

lseek(fd, 0, SEEK_SET);

mapped_mem = mmap(0, flength, PROT_READ, MAP_PRIVATE, fd, 0);

GetEmail(mapped_mem);

GetLink(mapped_mem);

close(fd);

munmap(mapped_mem, flength);

__AnalyzeDone:

close(fd);

node->IsHandled = 1;

remove(node->file);

}

/**************************************************************

功能:为根节点设置兄弟节点

***************************************************************/

void AddInitNode(char * Host, char * Page, int Port, char * Dir)

{

WEBNODE * NewNode;

char filename[MAXFILENAME + 1] = "";

if(NodeHeader == NULL) NewNode = NodeHeader = (WEBNODE *)malloc(sizeof(WEBNODE));

else NodeTail->pother = NewNode = (WEBNODE *)malloc(sizeof(WEBNODE));

memset(NewNode, 0, sizeof(WEBNODE));

NewNode->host = (char *)malloc(strlen(Host) + 1);

memset(NewNode->host, 0, strlen(Host) + 1);

NewNode->page = (char *)malloc(strlen(Page) + 1);

memset(NewNode->page, 0, strlen(Page) + 1);

NewNode->dir = (char *)malloc(strlen(Dir) + 1);

memset(NewNode->dir, 0, strlen(Dir) + 1);

NewNode->file = (char *)malloc(MAXFILENAME + 1);

memset(NewNode->file, 0, MAXFILENAME + 1);

strcpy(NewNode->host, Host);

strcpy(NewNode->page, Page);

strcpy(NewNode->dir, Dir);

sprintf(filename, "file%05d.html", FileNumber++);

strcpy(NewNode->file, filename);

NewNode->port = Port;

NewNode->IsHandled = 0;

NewNode->pother = 0;

NewNode->child = 0;

NodeTail = NewNode;

}

/**************************************************************

功能:处理根节点信息

***************************************************************/

void HandleInitNode(WEBNODE * node)

{

WEBNODE * CurrentNode = 0;

CurrentNode = node;

if(CurrentNode) {

while(CurrentNode) {

if(CurrentNode->IsHandled == 0) {

HandOneNode(CurrentNode);

if(DEBUG) {

fprintf(stdout, "\nDisplay.%5d:", FileNumber);

DisplayNode(NodeHeader); /* display every node */

}

}

CurrentNode = CurrentNode->pother;

}

CurrentNode = node;

while(CurrentNode) {

if(CurrentNode->child && CurrentNode->child->IsHandled == 0) {

HandleInitNode(CurrentNode->child);

}

CurrentNode = CurrentNode->pother;

}

}

}

/**************************************************************

功能:显示年有节点信息

***************************************************************/

void DisplayNode(WEBNODE * NodeHeader)

{

WEBNODE * TempNode;

TempNode = NodeHeader;

fprintf(stdout, "\n");

while(TempNode) {

if(!strcmp(TempNode->dir, "/")) fprintf(stdout, "\t%s:%d%s%s => %s %d\n", TempNode->host, TempNode->port, TempNode->dir, strcmp(TempNode->page, "@")?TempNode->page:"", TempNode->file, TempNode->IsHandled);

else fprintf(stdout, "\t%s:%d/%s/%s => %s %d\n", TempNode->host, TempNode->port, TempNode->dir, strcmp(TempNode->page, "@")?TempNode->page:"", TempNode->file, TempNode->IsHandled);

TempNode = TempNode->pother;

}

TempNode = NodeHeader;

while(TempNode) {

if(TempNode->child) DisplayNode(TempNode->child);

TempNode = TempNode->pother;

}

}

/**************************************************************

功能:处理单个节点信息

***************************************************************/

void HandOneNode(WEBNODE * node)

{

char UserAgent[1024] = "", Accept[1024] = "", AcceptLanguage[1024] = "", AcceptEncoding[1024] = "", AcceptCharset[1024] = "", KeepAlive[1024] = "", Connection[1024] = "", ContentType[1024] = "";

NodeCurr = node;

if((host=gethostbyname(NodeCurr->host))==NULL) /* get ip address by domain */

{

if(DEBUG) fprintf(stderr,"\tGethostname ‘%s‘ error, %s\n", NodeCurr->host, strerror(errno));

exit(1);

}

GetLocalAgent(UserAgent, Accept, AcceptLanguage, AcceptEncoding, AcceptCharset, KeepAlive, Connection, ContentType); /* Get client powser information */

if(strcmp(NodeCurr->dir, "/")) sprintf(request, "GET /%s/%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\nAccept: %s\r\nConnection: %s\r\n\r\n", NodeCurr->dir, strcmp(NodeCurr->page, "@")?NodeCurr->page:"", NodeCurr->host, UserAgent, Accept, Connection);

else sprintf(request, "GET %s%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\nAccept: %s\r\nConnection: %s\r\n\r\n", NodeCurr->dir, strcmp(NodeCurr->page, "@")?NodeCurr->page:"", NodeCurr->host, UserAgent, Accept, Connetion);

DoneWithList(1);

AnalyzePage(NodeCurr);

}

/**************************************************************

功能:从字符串 src 中分析出邮件地址保存到文件

***************************************************************/

void GetEmail(char * src)

{

char * pa, * pb, * pc, *pd;

char myemail[1024] = "";

FILE * mailfp = NULL;

if((mailfp = fopen("email.txt", "a+")) == NULL) return;

pa = src;

while((pb = strchr(pa,‘@‘))) {

GetBeforePos(pb, &pc);

GetAfterPos(pb, &pd);

if(pc && pd && (strlen(pc) > (strlen(pd) + 3))) {

memset(myemail, 0, 1024);

memcpy(myemail, pc, strlen(pc) - strlen(pd));

if(strcmp(NodeCurr->dir, "/")) fprintf(mailfp, "%s\thttps://%s/%s/%s\n", myemail, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "@")?NodeCurr->page:"");

else fprintf(mailfp, "%s\thttps://%s%s%s\n", myemail, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "@")?NodeCurr->page:"");

if(*(pd + 1)) pa = pd + 1;

else peak;

}

else if(*(pb + 1)) pa = pb + 1;

else peak;

}

fclose(mailfp);

}

/**************************************************************

功能:从 src 中找出前面的字母、数字等内含,即 email 地址中 @ 的前面部分

***************************************************************/

void GetBeforePos(char * src, char ** d)

{

char * x;

if(src - 1) x = src - 1;

else {*d = 0; return ;}

while(x) {

if(*x >= ‘a‘ && *x

else if(*x >= ‘A‘ && *x

else if(*x >= ‘0‘ && *x

else if(*x == ‘.‘ || *x == ‘-‘ || *x == ‘_‘) {x--; continue;}

else {peak;}

}

x++;

if(x) *d = x;

else *d = 0;

}

/**************************************************************

功能:从 src 中找出后面的字母、数字等内含,即 email 地址中 @ 的后面部分

***************************************************************/

void GetAfterPos(char * src, char ** d)

{

char * x;

if(src + 1) x = src + 1;

else {*d = 0; return ;}

while(x) {

if(*x >= ‘a‘ && *x

else if(*x >= ‘A‘ && *x

else if(*x >= ‘0‘ && *x

else if(*x == ‘.‘ || *x == ‘-‘ || *x == ‘_‘) {x++; continue;}

else {peak;}

}

if(x) *d = x;

else *d = 0;

}

/**************************************************************

功能:从 src 中找出前面的字母、数字等内含,即一个网页地址中主机名后面的部分

***************************************************************/

void GetAfterPosWithSlash(char * src, char ** d)

{

char * x;

if(src) x = src;

else {*d = 0; return ;}

while(x) {

if(*x >= ‘a‘ && *x

else if(*x >= ‘A‘ && *x

else if(*x >= ‘0‘ && *x

else if(*x == ‘.‘ || *x == ‘-‘ || *x == ‘_‘ || *x == ‘=‘) {x++; continue;}

else if(*x == ‘:‘ || *x == ‘/‘ || *x == ‘?‘ || *x == ‘&‘) {x++; continue;}

else {peak;}

}

if(x) *d = x;

else *d = 0;

}

/**************************************************************

功能:为 myanchor 分配 len 大小的内存

***************************************************************/

void GetMemory(char ** myanchor, int len)

{

if(!(*myanchor)) (*myanchor) = (char *)malloc(len + 1);

else (*myanchor) = (char *)realloc((void *)(*myanchor), len + 1);

memset((*myanchor), 0, len + 1);

}

/**************************************************************

功能:从 src 中分析出网页链接,并加入到当前节点的子节点上

***************************************************************/

void GetLink(char * src)

{

char * pa, * pb, * pc;

char * myanchor = 0;

int len = 0;

pa = src;

do {

if((pb = strstr(pa, "href=‘"))) {

pc = strchr(pb + 6, ‘\‘‘);

len = strlen(pb + 6) - strlen(pc);

GetMemory(&myanchor, len);

memcpy(myanchor, pb + 6, len);

}

else if((pb = strstr(pa, "href=\""))) {

pc = strchr(pb + 6, ‘"‘);

len = strlen(pb + 6) - strlen(pc);

GetMemory(&myanchor, len);

memcpy(myanchor, pb + 6, len);

}

else if((pb = strstr(pa, "href="))) {

GetAfterPosWithSlash(pb + 5, &pc);

len = strlen(pb + 5) - strlen(pc);

GetMemory(&myanchor, len);

memcpy(myanchor, pb + 5, len);

}

else {goto __returnLink ;}

/*

if(DEBUG) {

if(strcmp(NodeCurr->dir, "/")) fprintf(stdout, "%s\thttps://%s/%s/%s\n", myanchor, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "`")?NodeCurr->page:"");

else fprintf(stdout, "%s\thttps://%s%s%s\n", myanchor, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "`")?NodeCurr->page:"");

}

*/

if(strlen(myanchor) > 0) AddChildNode(NodeCurr, myanchor);

if(pc + 1) pa = pc + 1;

}while(pa);

__returnLink:

return;

}

/**************************************************************

功能:为当前节点增加子节点

***************************************************************/

void AddChildNode(WEBNODE * node, char * src)

{

int WebPort, len;

char * WebHost = 0, * PageAddress = 0, * WebDir = 0, * pC = 0;

WEBNODE * NewNode;

char filename[MAXFILENAME + 1] = "";

char IsFromRoot = 0;

if(!src) return;

if(!strncasecmp(src, "mailto:", strlen("mailto:"))) return ;

if(strstr(src, ".css")) return;

if(strstr(src, ".xml")) return;

if(strstr(src, ".ico")) return;

if(strstr(src, ".jpg")) return;

if(strstr(src, ".gif")) return;

if(strstr(src, "javascript:")) return;

if(strstr(src, "+")) return;

ret = GetHost(src, &WebHost, &PageAddress, &WebPort, &WebDir);

if(ret) {

len = strlen(node->host);

GetMemory(&WebHost, len);

strcpy(WebHost, node->host);

WebPort = node->port;

IsFromRoot = !strncmp(src, "/", 1);

if(IsFromRoot && (src + 1)) Rstrchr(src + 1, ‘/‘, &pC);

else if(!IsFromRoot) Rstrchr(src, ‘/‘, &pC);

else pC = 0;

if(pC) {

if(IsFromRoot) len = strlen(src + 1) - strlen(pC);

else len = strlen(src) - strlen(pC) + strlen(node->dir) + 1;

GetMemory(&WebDir, len);

if(IsFromRoot) memcpy(WebDir, src + 1, len);

else {memcpy(WebDir, node->dir, strlen(node->dir)); strcat(WebDir, "/"); memcpy(WebDir + strlen(node->dir) + 1, src, strlen(src) - strlen(pC));}

if(pC + 1) {

len = strlen(pC + 1);

GetMemory(&PageAddress, len);

strcpy(PageAddress, pC + 1);

}

else {

len = 1;

GetMemory(&PageAddress, len);

memcpy(PageAddress, e, len);

}

}

else {

if(IsFromRoot) {

len = 1;

GetMemory(&WebDir, len);

memcpy(WebDir, e + 1, len);

len = strlen(src + 1);

GetMemory(&PageAddress, len);

memcpy(PageAddress, src + 1, len);

}

else {

len = strlen(node->dir);

GetMemory(&WebDir, len);

memcpy(WebDir, node->dir, len);

len = strlen(src);

GetMemory(&PageAddress, len);

memcpy(PgeAddress, src, len);

}

}

}

ret = IsExistWeb(NodeHeader, WebHost, PageAddress, WebPort, WebDir);

if(ret) goto __ReturnAdd;

if(node->child == NULL) NewNode = node->child = (WEBNODE *)malloc(sizeof(WEBNODE));

else NodeTail->pother = NewNode = (WEBNODE *)malloc(sizeof(WEBNODE));

memset(NewNode, 0, sizeof(WEBNODE));

NewNode->host = (char *)malloc(strlen(WebHost) + 1);

memset(NewNode->host, 0, strlen(WebHost) + 1);

NewNode->page = (char *)malloc(strlen(PageAddress) + 1);

memset(NewNode->page, 0, strlen(PageAddress) + 1);

NewNode->dir = (char *)malloc(strlen(WebDir) + 1);

memset(NewNode->dir, 0, strlen(WebDir) + 1);

NewNode->file = (char *)malloc(MAXFILENAME + 1);

memset(NewNode->file, 0, MAXFILENAME + 1);

strcpy(NewNode->host, WebHost);

strcpy(NewNode->page, PageAddress);

strcpy(NewNode->dir, WebDir);

sprintf(filename, "file%05d.html", FileNumber++);

strcpy(NewNode->file, filename);

NewNode->port = WebPort;

NewNode->IsHandled = 0;

NewNode->pother = 0;

NewNode->child = 0;

NodeTail = NewNode;

__ReturnAdd:

free(WebHost); free(PageAddress); free(WebDir);

}

/**************************************************************

功能:检查是否已经处理过的网页

***************************************************************/

int IsExistWeb(WEBNODE * node, char * host, char * page, int port, char * dir)

{

WEBNODE * t;

t = node;

while(t) {

if(!strcmp(t->host, host) && !strcmp(t->page, page) && t->port == port && !strcmp(t->dir, dir)) return 1;

t = t->pother;

}

t = node;

while(t) {

if(t->child) {

ret = IsExistWeb(t->child, host, page, port, dir);

if(ret) return 2;

}

t = t->pother;

}

return 0;

}

编译这个程序:

gcc mailaddrsearch.c -o mailsearcher

输入一个网址作为参数运行一下试试吧:

./mailsearcher/5531748.html

程序首先找出/5531748.html 页面上的邮件地址保存到当前目录下 email.txt 文件里,每行一条记录,格式为邮件地址和出现该邮件地址的网页。然后分析这个页面上出现的网页链接,把各链接作为子节点加入链表,再去处理子节点,重复上述操作。

这只是一个示例程序,并不完善,如果要使其达到实用的目的,还需要让这个程序效率更高点,比如加入 epoll ( 在 2.4 内核中只有 select 了 ) 实现 I/O 多路复用。又比如对每个子节点实现多线程,每个线程处理一个节点。

如果对 I/O 多路复用不熟悉,您可以看一下我这篇文章/5345930.html 里关于 “ Linux 下各类TCP网络服务器的实现源代码”

猜你想看
相关文章

Copyright © 2008 - 2022 版权所有 职场范文网

工业和信息化部 备案号:沪ICP备18009755号-3