获取页面内容后,分析页面中的链接地址,得到要抓取的 URL:
/// <summary>
/// Extracts anchor titles and link targets from one region of a web page.
/// The page is obtained through <c>PublicFun.Get_Http</c>; the region is the text
/// between the first occurrence of <paramref name="blockB"/> and the next
/// occurrence of <paramref name="blockE"/>.
/// </summary>
/// <param name="urlStr">
/// Address of the page to scan. It must contain both "." and "/"; relative links
/// found in the page are resolved against it.
/// </param>
/// <param name="blockB">Marker text that precedes the region to scan.</param>
/// <param name="blockE">Marker text that follows the region to scan.</param>
/// <returns>
/// For every accepted anchor, its inner text (passed through
/// <c>PublicFun.RemoveHtmlCode</c>) immediately followed by its resolved URL; all
/// entries are concatenated with no separator. An empty string is returned when the
/// URL is malformed, the markers are not found, the region contains no tag, an href
/// is cut off at the very end of the region, or any exception occurs.
/// </returns>
public string SniffWebUrl( string urlStr,string blockB,string blockE )
{
    try
    {
        int firstDot = urlStr.IndexOf('.');
        int lastSlash = urlStr.LastIndexOf('/');
        if (firstDot < 0 || lastSlash < 0)
        {
            return "";
        }

        // siteRoot: everything before the first '/' that follows the first '.'.
        // pageDir:  everything before the last '/'.
        // When the address has no '/' after the first '.', both are the address itself.
        int pathSlash = urlStr.IndexOf('/', firstDot);
        string siteRoot = pathSlash < 0 ? urlStr : urlStr.Substring(0, pathSlash);
        string pageDir = pathSlash < 0 ? urlStr : urlStr.Substring(0, lastSlash);

        // NOTE(review): time1 is a member declared outside this method; it looks like
        // a timeout handed to the downloader - confirm against PublicFun.Get_Http.
        string page = new PublicFun().Get_Http( urlStr ,time1);

        // Locate the region. Ordinal search keeps the match length equal to
        // blockB.Length, which the offset arithmetic below relies on.
        // A marker at index 0 is valid (the previous "> 0" test rejected it).
        int blockStart = page.IndexOf(blockB, StringComparison.Ordinal);
        if (blockStart < 0)
        {
            return "";
        }
        int contentStart = blockStart + blockB.Length;
        int blockEnd = page.IndexOf(blockE, contentStart, StringComparison.Ordinal);
        if (blockEnd <= blockStart)
        {
            return "";
        }

        string region = page.Substring(contentStart, blockEnd - contentStart);
        int firstTag = region.IndexOf('<');
        if (firstTag < 0)
        {
            // No markup at all inside the region, hence nothing to extract.
            return "";
        }

        // Drop any text before the first tag, then normalise the spellings the
        // scanner below depends on: lower-case anchor tags, lower-case href,
        // and a double quote as the opening quote of the href value.
        region = region.Substring(firstTag)
            .Replace("<A", "<a")
            .Replace("</A", "</a")
            .Replace("Href=", "href=")
            .Replace("HREF=", "href=")
            .Replace("href='", "href=\"");

        string result = "";
        string remaining = region;
        int hrefPos = remaining.IndexOf("href=", StringComparison.Ordinal);
        while (hrefPos >= 0)
        {
            // Position of the first character of the href value (skip an opening quote).
            int valueStart = hrefPos + 5;
            if (valueStart < remaining.Length && remaining[valueStart] == '"')
            {
                valueStart++;
            }
            if (valueStart >= remaining.Length)
            {
                // The href is cut off at the end of the region; the whole scan is
                // reported as failed, as before.
                return "";
            }

            if (remaining[valueStart] == '#')
            {
                // In-page fragment link: skip it and look for the next href.
                remaining = remaining.Substring(valueStart);
                hrefPos = remaining.IndexOf("href=", StringComparison.Ordinal);
                continue;
            }

            int spacePos = remaining.IndexOf(' ', valueStart);
            int tagEnd = remaining.IndexOf('>', valueStart);
            int anchorClose = tagEnd < 0
                ? -1
                : remaining.IndexOf("</a", tagEnd, StringComparison.Ordinal);
            if (anchorClose < 0)
            {
                // Unterminated anchor: stop scanning but keep what was collected so far.
                break;
            }

            string title = remaining.Substring(tagEnd + 1, anchorClose - tagEnd - 1);

            // The href value ends at the first space or at '>', whichever comes first.
            int valueEnd = (spacePos < 0 || spacePos > tagEnd) ? tagEnd : spacePos;
            if (valueEnd > valueStart)
            {
                // Strip a closing quote; skipped for an empty value so the length
                // computed below can never become negative.
                char lastChar = remaining[valueEnd - 1];
                if (lastChar == '"' || lastChar == '\'')
                {
                    valueEnd--;
                }
            }
            string link = remaining.Substring(valueStart, valueEnd - valueStart);

            // Links that carry no http:// or https:// are resolved against the page address.
            bool isAbsolute =
                link.IndexOf("http://", StringComparison.Ordinal) >= 0 ||
                link.IndexOf("https://", StringComparison.Ordinal) >= 0;
            if (!isAbsolute)
            {
                if (link.IndexOf('/') == 0)
                {
                    // Root-relative link.
                    link = siteRoot + link;
                }
                else
                {
                    // Count "../" segments, keeping only the text after the last one.
                    int upLevels = 0;
                    int upPos = link.IndexOf("../", StringComparison.Ordinal);
                    while (upPos >= 0)
                    {
                        link = link.Substring(upPos + 3);
                        upLevels++;
                        upPos = link.IndexOf("../", StringComparison.Ordinal);
                    }

                    if (upLevels == 0)
                    {
                        link = pageDir + "/" + link;
                    }
                    else
                    {
                        // Remove the file name plus one directory per "../",
                        // but never climb above the site root.
                        string baseUrl = urlStr;
                        int toStrip = upLevels;
                        while (toStrip >= 0 && baseUrl.Length > siteRoot.Length)
                        {
                            int cut = baseUrl.LastIndexOf('/');
                            if (cut < 0)
                            {
                                break;
                            }
                            baseUrl = baseUrl.Substring(0, cut);
                            toStrip--;
                        }
                        link = baseUrl + "/" + link;
                    }
                }
            }

            result = result + new PublicFun().RemoveHtmlCode( title ) + link;

            // Continue after the closing tag ("</a" plus one more character),
            // clamped so that a region ending right after "</a" does not throw.
            remaining = remaining.Substring(Math.Min(anchorClose + 4, remaining.Length));
            hrefPos = remaining.IndexOf("href=", StringComparison.Ordinal);
        }

        return result;
    }
    catch (Exception)
    {
        // Best-effort contract: any failure (null arguments, download errors, ...)
        // is reported to the caller as an empty result.
        return "";
    }
}
- 用Photoshop给漂亮的烫发MM抠图
- Photoshop透明婚纱抠图大法
- Photoshop:让MM做个“变色龙”
- 用Photoshop来制作一款精美的宝宝照片墙
- Photoshop绝色美女通道抠图法
- 用Photoshop教你打造绚丽光芒效果
巧巧读书:http://www.qqread.com/csharp/k314900.html
进入讨论组讨论。相关图文阅读
频道图文推荐
健 康 咨 询
时 尚 咨 询
相关专题
- C#变得越来越臃肿是不可避免的? (7次浏览)
- C# 3.0新特性之扩展方法 (1次浏览)
- 实验分析C#中三种计时器使用异同点 (0次浏览)
- C#调用QTP自动化对象模型的实例 (0次浏览)
- 利用 C# 实现任务栏通知窗口 (0次浏览)
- 深入C#学习系列之不可小瞧的using关键字 (0次浏览)
- 如何用.NET技术在线生成网站LOGO (0次浏览)
- C#实现遗传算法 模拟花朵的进化 (0次浏览)
- c#里的运算符重载 (0次浏览)
- QTP调用自己开发的.net类库 (0次浏览)



