Miki GrabWebpage
Miki GrabWebpage
using
using
using
using
using
using
using
System;
System.Collections.Generic;
System.ComponentModel;
System.Data;
System.Drawing;
System.Linq;
System.Text;
System.Windows.Forms;
listView1.Items.Add(itemTemp); // listViewItem
ListView
}
this.listView1.EndUpdate();
//UI
*/
#endregion
#endregion
// 3
List<HtmlContentElement> contentList =
ParseLinksText2(site);
//
List<HtmlContentElement> contentListNonRepeat =
contentList.Distinct().ToList();
// http https
List<HtmlContentElement> contentListDiff =
PureHTTPLink(contentListNonRepeat);
// listView
ShowListView(contentListDiff);
}
//
/// <summary>
/// Click handler that walks listing pages 1 and 2 of a fixed blog address,
/// parses the article links of each page, shows them in the list view and
/// hands them to <c>SavePDF</c>.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnGetArticle_Click(object sender, EventArgs e)
{
    const int firstPage = 1;
    const int lastPage = 2;

    // NOTE(review): the host part of this address looks like a mirror/proxy
    // artifact - confirm the intended URL before relying on it.
    string baseAddress = "http://blog.yam.com/fern724&page=";

    // XPath selecting the title anchors of the posts on a listing page.
    string titleLinkXPath = @"//div[@class='post_titlediv']/a[@href]";

    int j = firstPage;
    while (j <= lastPage)
    {
        string pageNumber = j.ToString();

        // Reset the list and show which page is being processed.
        listView1.Items.Clear();
        label4.Text = pageNumber;
        Application.DoEvents();

        // Publish the page address to the shared field and to the text box.
        site = baseAddress + pageNumber;
        txtSite.Text = site;
        Application.DoEvents();

        // Keep asking until ParseArticle hands back a list.
        // NOTE(review): there is no retry limit or delay here.
        do
        {
            Article = ParseArticle(site, titleLinkXPath);
        }
        while (Article == null);

        ShowListView(Article);
        SavePDF(j);

        j++;
    }

    MessageBox.Show("");
}
/// <summary>
/// Exports every entry of <c>Article</c> and then empties the list.
/// Does nothing when <c>Article</c> is null.
/// </summary>
/// <param name="j">Number placed at the front of each generated PDF file name.</param>
// Stages, in order:
//   1. When ckBoxHtml is checked, each entry is passed to SaveWebHTML with the
//      name "<index>.<link text>.html".
//   2. When ckBoxPdf is checked, each entry is passed to SaveWebPDF, and each
//      resulting file is then handed to ChangeSavePath with a target under
//      Application.StartupPath.
//   3. AddUrl2PDF is called for every entry with the target path and the link.
// NOTE(review): stage 3 sits outside the ckBoxPdf check, so it also runs when
// PDF export is unchecked - confirm that this is intended.
// NOTE(review): in this copy several statements below are line-wrapped or
// garbled (the Replace(""", "") calls, commented-out code that continues on
// the following line, a verbatim string split across two lines); compare with
// the original source before compiling.
private void SavePDF(int j) // j: prefix for the generated PDF file names
{
if (Article != null)
{
// Optional HTML export: one SaveWebHTML call per entry.
#region html
if (ckBoxHtml.Checked == true)
{
for (int i = 0; i < Article.Count; i++)
// iterates over all Article.Count entries
{
SaveWebHTML(Article[i].HtmlLink, i + "." +
Article[i].HtmlLinkText + ".html");
}
}
#endregion
// Optional PDF export: render every entry first, then relocate the files.
#region pdf
if (ckBoxPdf.Checked == true)
{
for (int i = 0; i < Article.Count; i++)
{
// Link with "http://" and every "/" removed, used as part of the file name.
string tempSite =
Article[i].HtmlLink.Replace("http://","").Replace("/","");
SaveWebPDF(Article[i].HtmlLink, j + "_" + i +
"." + tempSite + "_" + Article[i].HtmlLinkText.Replace(""", "") +
".pdf"); //"_" + Article[i].HtmlLink.Replace("https://",
"").Replace("//", "_") +
}
// PDF
for (int i = 0; i < Article.Count; i++)
{
//SaveWebPDF(Article[i].HtmlLink, i + "." +
Article[i].HtmlLinkText + ".pdf");
string tempSite =
Article[i].HtmlLink.Replace("http://", "").Replace("/", "");
string temp = j + "_" + i + "." + tempSite +
"_" + Article[i].HtmlLinkText.Replace(""", "") + ".pdf"; //"_" +
Article[i].HtmlLink.Replace("http://", "").Replace("//","_") +
ChangeSavePath(temp.Replace(" ", ""),
Application.StartupPath + @"\WebPDF \" + temp.Replace(" ", ""));
}
}
#endregion
// pdf
#region // pdf
for (int i = 0; i < Article.Count; i++)
{
//SaveWebPDF(Article[i].HtmlLink, i + "." +
Article[i].HtmlLinkText + ".pdf");
//
string tempSite =
Article[i].HtmlLink.Replace("http://", "").Replace("/", "");
string temp = j + "_" + i + "." + tempSite + "_" +
Article[i].HtmlLinkText.Replace(""", "") + ".pdf"; //"_" +
Article[i].HtmlLink.Replace("http://", "").Replace("//","_") +
//ChangeSavePath(temp.Replace(" ", ""),
Application.StartupPath + @"\WebPDF \" + temp.Replace(" ", ""));
AddUrl2PDF(Application.StartupPath + @"\WebPDF
\" + temp.Replace(" ", ""), Article[i].HtmlLink);
}
#endregion
//MessageBox.Show("");
Article.Clear();
}
}
// html pdf
private void btnSave_Click(object sender, EventArgs e)
{
if (Article != null)
{
// html
#region html
if (ckBoxHtml.Checked == true)
{
for (int i = 0; i < Article.Count; i++)
//Article.Count
{
SaveWebHTML(Article[i].HtmlLink, i + "." +
Article[i].HtmlLinkText + ".html");
}
}
#endregion
// pdf
#region pdf
if (ckBoxPdf.Checked == true)
{
for (int i = 0; i < Article.Count; i++)
{
SaveWebPDF(Article[i].HtmlLink, i + "." +
Article[i].HtmlLinkText.Replace(""", "") + ".pdf"); //"_" +
Article[i].HtmlLink.Replace("https://", "").Replace("//", "_") +
}
// PDF
for (int i = 0; i < Article.Count; i++)
{
//SaveWebPDF(Article[i].HtmlLink, i + "." +
Article[i].HtmlLinkText + ".pdf");
string temp = i + "." +
#endregion
// PDF
#region void SaveWebPDF(string html, string filename)
/// <summary>
/// Renders a web page to a PDF file by running
/// <c>phantomjs rasterize2.js &lt;url&gt; &lt;file&gt;</c> through <c>ExecuteCmd</c>.
/// </summary>
/// <param name="html">Address of the page to render.</param>
/// <param name="filename">
/// Output file name. All blanks are removed before it is used, so callers
/// must look for the file under the blank-free name.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="html"/> or <paramref name="filename"/> is null.
/// </exception>
private void SaveWebPDF(string html, string filename)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }

    // Both values end up on a cmd.exe command line. Unquoted, a '&' in the
    // address (common in query strings) would end the phantomjs command and
    // start a second one. Each argument is therefore wrapped in double
    // quotes, and embedded double quotes are neutralised so they cannot
    // close the quoting early.
    string quotedUrl = "\"" + html.Replace("\"", "%22") + "\"";
    string quotedFile = "\"" + filename.Replace(" ", "").Replace("\"", "") + "\"";

    string command = "phantomjs" + " rasterize2.js" + " " + quotedUrl + " " + quotedFile;
    ExecuteCmd(command);
}
/// <summary>
/// Runs a command line through <c>cmd.exe /C</c> without creating a console
/// window and blocks until the command has exited.
/// </summary>
/// <param name="command">Text appended after <c>/C</c> on the cmd.exe command line.</param>
private void ExecuteCmd(string command)
{
    // Process owns operating-system handles; dispose it once the command is done.
    using (System.Diagnostics.Process p = new System.Diagnostics.Process())
    {
        p.StartInfo.FileName = "cmd.exe";
        p.StartInfo.Arguments = @"/C " + command;
        p.StartInfo.UseShellExecute = false; // start cmd.exe directly, not via the shell
        p.StartInfo.CreateNoWindow = true;   // keep the console window hidden
        p.Start();
        p.WaitForExit();
    }
}
#endregion
#region listView
/// <summary>
/// Opens the address stored in sub-item 1 of the activated row by passing
/// its absolute URI to <c>Process.Start</c>.
/// </summary>
/// <param name="sender">The <c>ListView</c> that raised the event.</param>
/// <param name="e">Event data (not used).</param>
private void listView1_ItemActivate(object sender, EventArgs e)
{
    ListView listview = (ListView)sender;

    // Nothing to open when no row is selected or the row lacks sub-item 1;
    // indexing either collection blindly would throw.
    if (listview.SelectedItems.Count == 0 || listview.SelectedItems[0].SubItems.Count < 2)
    {
        return;
    }

    string address = listview.SelectedItems[0].SubItems[1].Text;

    Uri url;
    try
    {
        url = new Uri(address);
    }
    catch (UriFormatException ex)
    {
        // The text is not an absolute URI; report it and stop.
        MessageBox.Show(ex.ToString());
        return;
    }

    try
    {
        System.Diagnostics.Process.Start(url.AbsoluteUri);
    }
    catch (Exception ex)
    {
        // Launching can fail for reasons outside our control; show the details.
        MessageBox.Show(ex.ToString());
    }
}
#endregion
// =========================
//
#region List<string> ParseLinks(string html)
/// <summary>
/// Loads a page and collects the <c>href</c> value of every
/// <c>&lt;a href=...&gt;</c> element in it.
/// </summary>
/// <param name="html">Address of the page to load.</param>
/// <returns>
/// The href values in the order the nodes were returned; an empty list when
/// the page contains no matching anchor.
/// </returns>
private List<string> ParseLinks(string html)
{
    List<string> links = new List<string>();

    HtmlWeb web = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);

    // SelectNodes yields null, not an empty collection, when nothing matches.
    HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return links;
    }

    foreach (HtmlNode anchor in anchors)
    {
        if (anchor == null)
        {
            continue;
        }

        links.Add(anchor.Attributes["href"].Value);
    }

    return links;
}
#endregion
// Collect the href of every anchor and, in parallel, its inner text
#region private List<string> ParseLinksText(string html, ref List<string> LinkText)
/// <summary>
/// Loads a page and collects, for every <c>&lt;a href=...&gt;</c> element,
/// its <c>href</c> value (returned) and its inner text (appended to
/// <paramref name="LinkText"/>), in matching order.
/// </summary>
/// <param name="html">Address of the page to load.</param>
/// <param name="LinkText">
/// List that receives one inner-text entry per returned link. A new list is
/// created when null is passed in.
/// </param>
/// <returns>
/// The href values; an empty list when the page contains no matching anchor.
/// </returns>
private List<string> ParseLinksText(string html, ref List<string> LinkText)
{
    List<string> links = new List<string>();
    if (LinkText == null)
    {
        LinkText = new List<string>();
    }

    HtmlWeb web = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);

    // SelectNodes yields null, not an empty collection, when nothing matches.
    HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return links;
    }

    foreach (HtmlNode anchor in anchors)
    {
        if (anchor == null)
        {
            continue;
        }

        links.Add(anchor.Attributes["href"].Value);
        LinkText.Add(anchor.InnerText);
    }

    return links;
}
#endregion
// HtmlContent
#region private HtmlContent ParseLinksText(string html)
/// <summary>
/// Loads a page and stores, for every <c>&lt;a href=...&gt;</c> element, its
/// <c>href</c> value in <c>HtmlLinks</c> and its inner text in
/// <c>HtmlLinkTexts</c> of a new <c>HtmlContent</c>.
/// </summary>
/// <param name="html">Address of the page to load.</param>
/// <returns>
/// The filled <c>HtmlContent</c>; nothing is added to it when the page
/// contains no matching anchor.
/// </returns>
private HtmlContent ParseLinksText(string html)
{
    HtmlContent hcontents = new HtmlContent();

    HtmlWeb web = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);

    // SelectNodes yields null, not an empty collection, when nothing matches.
    HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return hcontents;
    }

    // Links and texts are added in lock-step so index i in one list matches index i in the other.
    foreach (HtmlNode anchor in anchors)
    {
        hcontents.HtmlLinks.Add(anchor.Attributes["href"].Value);
        hcontents.HtmlLinkTexts.Add(anchor.InnerText);
    }

    return hcontents;
}
#endregion
// Collect the links as HtmlContentElement objects (one element per link, holding both address and text)
#region private List<HtmlContentElement> ParseLinksText2(string html)
/// <summary>
/// Loads a page (with encoding auto-detection enabled) and builds one
/// <c>HtmlContentElement</c> per <c>&lt;a href=...&gt;</c> element, holding
/// the <c>href</c> value and the inner text.
/// </summary>
/// <param name="html">Address of the page to load.</param>
/// <returns>
/// The elements in the order the nodes were returned; an empty list when the
/// page contains no matching anchor.
/// </returns>
private List<HtmlContentElement> ParseLinksText2(string html)
{
    List<HtmlContentElement> elements = new List<HtmlContentElement>();

    HtmlWeb web = new HtmlWeb();
    web.AutoDetectEncoding = true;
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);

    // SelectNodes yields null, not an empty collection, when nothing matches.
    HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return elements;
    }

    foreach (HtmlNode anchor in anchors)
    {
        HtmlContentElement element = new HtmlContentElement();
        element.HtmlLink = anchor.Attributes["href"].Value;
        element.HtmlLinkText = anchor.InnerText;
        elements.Add(element);
    }

    return elements;
}
#endregion
// Keep only the entries that contain an http:// or https:// address
#region private List<string> CheckValidHTTPLink(List<string> input)
/// <summary>
/// Keeps the entries that contain <c>http://</c> or <c>https://</c> and trims
/// each of them so that it starts at that marker.
/// </summary>
/// <param name="input">Raw link strings; null or empty entries are skipped.</param>
/// <returns>
/// The trimmed links in input order. Entries without either marker are
/// dropped. An empty list is returned for a null or empty input.
/// </returns>
private List<string> CheckValidHTTPLink(List<string> input)
{
    List<string> validLinks = new List<string>();
    if (input == null)
    {
        return validLinks;
    }

    foreach (string candidate in input)
    {
        if (string.IsNullOrEmpty(candidate))
        {
            continue;
        }

        // "http://" takes precedence: "https://" is only looked for when the
        // plain marker is absent. Ordinal search keeps the result independent
        // of the current culture.
        int start = candidate.IndexOf("http://", StringComparison.Ordinal);
        if (start < 0)
        {
            start = candidate.IndexOf("https://", StringComparison.Ordinal);
        }

        if (start >= 0)
        {
            validLinks.Add(candidate.Substring(start));
        }
    }

    return validLinks;
}
#endregion
// Reduce each element's link to its pure http(s) address and drop the elements that have none
#region private List<HtmlContentElement> PureHTTPLink(List<HtmlContentElement> inputLinkList)
/// <summary>
/// Keeps the elements whose <c>HtmlLink</c> contains <c>http://</c> or
/// <c>https://</c> and rewrites that link so it starts at the marker.
/// </summary>
/// <param name="inputLinkList">
/// Elements to filter. The kept elements are the same instances as in the
/// input, with <c>HtmlLink</c> modified in place. Null elements and elements
/// with a null or empty link are skipped.
/// </param>
/// <returns>
/// The kept elements in input order; an empty list for a null or empty input.
/// </returns>
private List<HtmlContentElement> PureHTTPLink(List<HtmlContentElement> inputLinkList)
{
    List<HtmlContentElement> outputLinkList = new List<HtmlContentElement>();
    if (inputLinkList == null)
    {
        return outputLinkList;
    }

    foreach (HtmlContentElement content in inputLinkList)
    {
        if (content == null || string.IsNullOrEmpty(content.HtmlLink))
        {
            continue;
        }

        // "http://" takes precedence: "https://" is only looked for when the
        // plain marker is absent. Ordinal search keeps the result independent
        // of the current culture.
        string link = content.HtmlLink;
        int start = link.IndexOf("http://", StringComparison.Ordinal);
        if (start < 0)
        {
            start = link.IndexOf("https://", StringComparison.Ordinal);
        }

        if (start >= 0)
        {
            content.HtmlLink = link.Substring(start);
            outputLinkList.Add(content);
        }
    }

    return outputLinkList;
}
#endregion
// Collect the article links selected by the given XPath expression
#region private List<HtmlContentElement> ParseArticle(string html, string Xpath)
private List<HtmlContentElement> ParseArticle(string
html,string Xpath)
{
List<HtmlContentElement> hconElementOutput = new
List<HtmlContentElement>();
HtmlWeb web = new HtmlWeb();
web.AutoDetectEncoding = true;
HtmlAgilityPack.HtmlDocument doc = web.Load(html); //
Encoding e = doc.Encoding; //
//Xpath = "//a[@href]"; // (a href a //
a ( href ))
//Xpath = "//div[@class='post-body entry-
content']/a[@href]";
HtmlNodeCollection nodes =
doc.DocumentNode.SelectNodes(Xpath);//(xpath); // (node)
this.listView1.EndUpdate();
//UI
}
#endregion
// Move a file to another path (copy it to the target, then delete the source)
#region void ChangeSavePath(string sourcePath, string TargetPath)
/// <summary>
/// Moves a file: copies <paramref name="sourcePath"/> to
/// <paramref name="TargetPath"/>, replacing any existing file there, and then
/// deletes the source.
/// </summary>
/// <param name="sourcePath">Existing file to move.</param>
/// <param name="TargetPath">Destination file path; an existing file is overwritten.</param>
private void ChangeSavePath(string sourcePath, string TargetPath)
{
    // File.Copy transfers the complete content whatever the file size. A
    // single Stream.Read into a buffer sized with an int cast can stop short
    // and cannot represent files above 2 GB.
    System.IO.File.Copy(sourcePath, TargetPath, true);

    // Reached only when the copy succeeded, so a failed copy never loses the source.
    System.IO.File.Delete(sourcePath);
}
#endregion
// pdf
#region public void AddUrl2PDF(string filepath, string url)
public void AddUrl2PDF(string filepath, string url)
{
string oldFile = filepath;//"0.ASP.NET2.0 Excel
.pdf";//"oldFile.pdf";
//string temp = "0.ASP.NET2.0 Excel
.pdf";
string newFile = "NewFile.pdf";//temp;
PdfReader reader = new PdfReader(oldFile);
iTextSharp.text.Rectangle dimension =
reader.GetPageSize(1);
iTextSharp.text.Rectangle size =
reader.GetPageSizeWithRotation(1);
Document document = new Document(size);
//===================================================
}
}