Our company's site uses many iframes that load pages from a remote server, which is not a good experience for visitors. So I wrote a program that fetches the remote web pages and saves their HTML to local files. However, those pages usually use relative paths to load images and to link to other resources, so I also wrote a conversion function that rewrites the HTML fetched from the remote server before it is saved to disk.
Here is the source:
/// <summary>
/// Downloads the web page at <paramref name="uri"/>, rewrites relative
/// href/src/background references to absolute ones via convertPath, and
/// saves the resulting HTML to <paramref name="filePath"/>.
/// The page is read and written using the GB2312 encoding.
/// </summary>
/// <param name="uri">Address of the remote page. Everything up to and including
/// its last '/' is used as the base for relative references.</param>
/// <param name="filePath">Local file to write; an existing file is overwritten.</param>
/// <returns>
///  1: success;
/// -1: the page could not be downloaded (network problem);
/// -2: the local file could not be written.
/// </returns>
public static short getFileFromRemote(string uri,string filePath)
{
    string uriBase = uri.Substring(0, uri.LastIndexOf("/") + 1);
    string tempCode = "";

    // Read the remote page. The using blocks release the response and the
    // streams even when reading fails part-way through.
    try
    {
        WebRequest request = WebRequest.Create(uri);
        using (WebResponse response = request.GetResponse())
        using (Stream resStream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(resStream, Encoding.GetEncoding("GB2312")))
        {
            tempCode = sr.ReadToEnd();
        }
    }
    catch
    {
        // Any failure while fetching is reported through the documented code.
        return -1;
    }

    // Replace relative paths with absolute ones.
    tempCode = convertPath(tempCode, uriBase);

    // Write the file. append=false truncates an existing file, so no separate
    // exists/delete step is needed.
    try
    {
        using (StreamWriter sw = new StreamWriter(filePath, false, Encoding.GetEncoding("GB2312")))
        {
            sw.Write(tempCode);
        }
    }
    catch
    {
        // Any failure while writing is reported through the documented code.
        return -2;
    }

    return 1;
}
// Matches an href/src/background attribute name (not preceded by a word
// character or '-', so e.g. "datasrc" is ignored), the '=' sign and an optional
// opening quote, including surrounding whitespace. The match ends exactly where
// the attribute value begins.
private static readonly Regex urlAttributePattern = new Regex(
    @"(?<![\w-])(?:href|src|background)\s*=\s*(?:[""']\s*)?",
    RegexOptions.IgnoreCase);

// Matches a URI scheme ("http:", "https:", "mailto:", "javascript:", ...)
// located exactly at the position the search starts from.
private static readonly Regex uriSchemePattern = new Regex(@"\G[A-Za-z][A-Za-z0-9+.\-]*:");

/// <summary>
/// Converts the relative paths of links and images in an HTML page into
/// absolute paths by inserting a base address in front of the values of
/// href, src and background attributes.
/// </summary>
/// <remarks>
/// The text is scanned with a regular expression, not an HTML parser, so
/// "src = ..." or "href = ..." assignments inside scripts are rewritten too.
/// Apart from the inserted prefixes the input is returned unchanged (letter
/// case included).
/// </remarks>
/// <param name="tempCode">HTML source to process; null or empty is returned as is.</param>
/// <param name="uriBase">Base address ending in '/', e.g. "http://host/dir/".
/// null is treated as an empty string.</param>
/// <returns>The HTML with relative references made absolute.</returns>
public static string convertPath(string tempCode,string uriBase)
{
    if (tempCode == null || tempCode.Length == 0)
    {
        return tempCode;
    }
    if (uriBase == null)
    {
        uriBase = "";
    }

    // Split the base into "scheme:" (for //host/... references) and
    // "scheme://host" (for /root-relative references). When the base has no
    // "://", root-relative references simply get uriBase prepended.
    string schemePrefix = null;
    string rootBase = uriBase;
    int schemeEnd = uriBase.IndexOf("://", System.StringComparison.Ordinal);
    if (schemeEnd > 0)
    {
        schemePrefix = uriBase.Substring(0, schemeEnd + 1);
        int pathStart = uriBase.IndexOf('/', schemeEnd + 3);
        if (pathStart >= 0)
        {
            rootBase = uriBase.Substring(0, pathStart);
        }
    }

    StringBuilder result = new StringBuilder(tempCode.Length + 64);
    int copiedUpTo = 0; // everything before this index is already in result

    for (Match m = urlAttributePattern.Match(tempCode); m.Success; m = m.NextMatch())
    {
        int valueStart = m.Index + m.Length;
        if (valueStart >= tempCode.Length)
        {
            // Attribute is cut off at the end of the input; nothing to rewrite.
            break;
        }

        string prefix;
        char first = tempCode[valueStart];
        if (first == '#' || uriSchemePattern.IsMatch(tempCode, valueStart))
        {
            // In-page anchors and references that already carry a scheme are
            // left untouched.
            continue;
        }
        else if (first == '/')
        {
            bool protocolRelative =
                valueStart + 1 < tempCode.Length && tempCode[valueStart + 1] == '/';
            if (protocolRelative)
            {
                if (schemePrefix == null)
                {
                    // No scheme known for the base; leave "//host/..." alone.
                    continue;
                }
                prefix = schemePrefix;
            }
            else
            {
                // Root-relative path: resolve against the host, not the directory.
                prefix = rootBase;
            }
        }
        else
        {
            prefix = uriBase;
        }

        result.Append(tempCode, copiedUpTo, valueStart - copiedUpTo);
        result.Append(prefix);
        copiedUpTo = valueStart;
    }

    // Append the tail of the source after the last rewritten attribute.
    result.Append(tempCode, copiedUpTo, tempCode.Length - copiedUpTo);
    return result.ToString();
} //end of convertPath()

