频道直达 - 专题 - 新闻 - 技巧 - 组网 - 开发 - 安全 - web编程 - 图像 - 操作系统 - 数据库 - 教育 - 旅游 - 健康 - 时尚 - 驱动 - 软件 - 游戏 - 多媒体 - ERP - 讨论组

利用C#实现web信息自动抓取

来源:qqread 作者: 出处:巧巧读书 2007-06-01 进入讨论组
获取页面内容后,分析页面中的链接地址,取得要抓取的 URL:
//处理页面标题和链接
        public string SniffWebUrl( string urlStr,string blockB,string blockE )
        {      
            string urlch1 = "";
            string urlch2 = "";                   
            int end_n1 = 0;
            int end_nums = 0;
            int end_nums1 = 0;
            int end_nums2 = 0;
            int end_nums3     = 0;           
            string reUTStr = "";
            string reTitle = "";
            string ret = "";          
            try
            {
                int pos01 = urlStr.IndexOf( "." );
                int pos02 = urlStr.LastIndexOf( "/" );
                if( pos01 < 0 )
                {
                    return "";
                }
                if( pos02 < 0 )
                {
                    return "";
                }
                int pos03 = urlStr.IndexOf( "/",pos01 );
                if ( pos03 < 0 )
                {
                    urlch1 = urlStr;
                    urlch2 = urlStr;
                }
                else
                {
                    urlch1 = urlStr.Substring( 0,pos03 );
                    urlch2 = urlStr.Substring( 0,pos02 );
                }
 
                string tmpAllStr = new PublicFun().Get_Http( urlStr ,time1);
 
                int pos1 = tmpAllStr.IndexOf( blockB );
                int pos2 = tmpAllStr.IndexOf( blockE,pos1 + blockB.Length );
                if ( pos1>0 && pos2>0 && pos2>pos1 )
                {
                    ret = tmpAllStr.Substring( pos1 + blockB.Length,pos2 - pos1 - blockB.Length );
                    ret = ret.Substring( ret.IndexOf( "<" ));
                    while( ret.IndexOf( "<A" ) >= 0 )
                    {
                        ret = ret.Substring( 0,ret.IndexOf( "<A" ) ) + "<a" + ret.Substring( ret.IndexOf( "<A" ) + 2 );
                    }
                    while( ret.IndexOf( "</A" ) >=0 )
                    {
                        ret = ret.Substring( 0,ret.IndexOf( "</A" ) ) + "</a" + ret.Substring( ret.IndexOf( "</A" ) + 3 );
                    }
                    while( ret.IndexOf( "Href=" ) >=0 )
                    {
                        ret = ret.Substring( 0,ret.IndexOf( "Href=" )) + "href=" + ret.Substring( ret.IndexOf( "Href=" ) + 5 );
                    }
                    while( ret.IndexOf( "HREF=" ) >=0 )
                    {
                        ret = ret.Substring( 0,ret.IndexOf( "HREF=" )) + "href=" + ret.Substring( ret.IndexOf( "HREF=" ) + 5 );
                    }
                    while( ret.IndexOf( "href='" ) >=0 )
                    {
                        ret = ret.Substring( 0,ret.IndexOf( "href='" )) + "href=\"" + ret.Substring( ret.IndexOf( "href='" ) + 6 );
                    }
                }      
                tmpAllStr = ret;     
                int begin_nums = tmpAllStr.IndexOf( "href=" );
 
                while ( begin_nums >= 0 )
                {              
                    string tmpStrA = "";
                    string tmpStrB = tmpAllStr.Substring( begin_nums + 5,1 );
                    if ( tmpStrB == "\"" )
                    {
                        end_n1 = begin_nums + 6;
                        if ( ( end_n1 + 1 ) > tmpAllStr.Length )
                        {
                            return "";
                        }
                        tmpStrA = tmpAllStr.Substring( begin_nums+6,1 );
                    }
                    else
                    {
                        end_n1 = begin_nums + 5;
                        tmpStrA = tmpStrB;
                    }
 
                    if ( tmpStrA == "#" )
                    {
                        tmpAllStr = tmpAllStr.Substring( end_n1 );
                        begin_nums = tmpAllStr.IndexOf( "href=" );
                    }
                    else
                    {                  
                        end_nums1 = tmpAllStr.IndexOf( " ",end_n1 );
                        end_nums2 = tmpAllStr.IndexOf( ">",end_n1 );
                        end_nums3 = tmpAllStr.IndexOf( "</a",end_nums2 );
 
                        if ( ( end_nums3 >= 0 ) && ( end_nums2 >= 0 ) )
                        {
                            reTitle = tmpAllStr.Substring( end_nums2 + 1,end_nums3 - end_nums2 - 1 );
 
                            if ( end_nums1 > end_nums2 )
                            {
                                end_nums = end_nums2;
                            }
                            else
                            {
                                if ( end_nums1 < 0 )
                                {
                                    end_nums = end_nums2;
                                }
                                else
                                {
                                    end_nums = end_nums1;
                                }
                            }
                            string str4 = tmpAllStr.Substring( end_nums-1, end_nums - end_nums + 1 );
 
                            if ( str4 =="\"" || str4 == "'" )
                            {
                                end_nums = end_nums - 1;
                            }
                            string sTotalOne = tmpAllStr.Substring( end_n1,end_nums - end_n1 );
 
                            if ( sTotalOne.IndexOf( "http://" ) <0 )
                            {
                                if ( sTotalOne.IndexOf( "/" ) == 0 )
                                {
                                    sTotalOne = urlch1 + sTotalOne;
                                }
                                else
                                {                              
                                    int linshiIntNum = 0;
                                    int flags = 0;
                                    string urlChange = urlStr;;
                                    while( sTotalOne.IndexOf( "../" ) >= 0 )
                                    {
                                        sTotalOne = sTotalOne.Substring( sTotalOne.IndexOf( "../" ) + 3 );
                                        linshiIntNum = linshiIntNum + 1;
                                        flags = flags +1;
                                    }
                                    while( ( urlChange.LastIndexOf( "/" ) >= 0 ) && ( linshiIntNum >= 0 ) )
                                    {
                                        urlChange = urlChange.Substring( 0,urlChange.LastIndexOf( "/" ) );
                                        linshiIntNum = linshiIntNum - 1;
                                    }
                                    if ( flags == 0 )
                                    {
                                        sTotalOne = urlch2 + "/" + sTotalOne;
                                    }
                                    else
                                    {
                                        sTotalOne = urlChange + "/" + sTotalOne;
                                    }
                                }
                            }
                            reUTStr = reUTStr + new PublicFun().RemoveHtmlCode( reTitle ) + sTotalOne;
 
                            tmpAllStr = tmpAllStr.Substring( end_nums3 + 4 );
                            begin_nums = tmpAllStr.IndexOf( "href=" );
                        }
                        else
                        {
                            begin_nums = -1;
                        }                   
                    }
                }
                return reUTStr;
            }
            catch( Exception e)
            {
                return "";
            }
        }
 

   巧巧读书:http://www.qqread.com/csharp/k314900.html

进入讨论组讨论。
收藏此文】【 】【打印】【关闭
相关图文阅读
频道图文推荐
健 康 咨 询
时 尚 咨 询
巧巧读书宗旨
相关专题
讨论组问题推荐
站内各频道最新更新文档
站内最新制作专题
热门关键字导读
Photoshop教 程照片处理 照片制作 PS快捷键 抠图
计 算 机 故 障XP系统修复
艺 术 与 设 计设计 流媒体 设计欣赏 边框
计 算 机 安 全ARP
站内频道文章精选
巧巧电脑频道编辑信箱  告诉我们您想看的专题或文章