Miki GrabWebpage

This document contains code for a C# application that grabs articles from web pages. It uses HTML Agility Pack to parse HTML and retrieve links and text from web pages. It can save the retrieved articles as HTML or PDF files and stamp each article's source URL onto its PDF. Key functions include parsing links and text from a page, saving to various file formats, and displaying results in a list view. The code implements multiple methods of retrieving, parsing, saving and presenting web page content.

Uploaded by

roy721225

Available Formats

Download as DOCX, PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

82 views

Miki GrabWebpage

Uploaded by

roy721225

Available Formats

Download as DOCX, PDF, TXT or read online on Scribd

You are on page 1/ 16

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Windows.Forms;

using HtmlAgilityPack; // DOM
using iTextSharp.text;
using iTextSharp.text.pdf;
namespace Miki_GrabWebPageArticle
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form and runs the designer-generated component setup.
/// </summary>
public Form1()
{
InitializeComponent();
}
//===================== shared state =====================
// Address of the page currently being processed; assigned by the button handlers.
string site = "";
// Result of the most recent ParseArticle call; read by SavePDF and btnSave_Click.
List<HtmlContentElement> Article;
//=========== END=======
//
/// <summary>
/// Click handler: loads the page named in <c>txtSite</c>, collects its anchors via
/// <c>ParseLinksText2</c>, keeps the entries that carry an http/https URL
/// (<c>PureHTTPLink</c>) and shows them in the list view.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnGetHref_Click(object sender, EventArgs e)
{
    listView1.Items.Clear();

    site = txtSite.Text.Trim();
    if (site.Length == 0)
    {
        // Nothing to fetch; avoid requesting the bare string "http://".
        return;
    }

    // Prefix a scheme only when the address does not already START with one.
    // A plain Contains("http") test would skip addresses such as
    // "example.com/http-basics" and leave them without a scheme.
    bool hasScheme =
        site.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
        site.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        site = "http://" + site;
    }

    List<HtmlContentElement> contentList = ParseLinksText2(site);

    // NOTE(review): Distinct() relies on HtmlContentElement's equality. If that
    // type does not override Equals/GetHashCode this only removes duplicate
    // references, not duplicate links - confirm against the type's definition.
    List<HtmlContentElement> contentListNonRepeat = contentList.Distinct().ToList();

    // Keep only entries whose href contains an absolute http/https URL.
    List<HtmlContentElement> contentListDiff = PureHTTPLink(contentListNonRepeat);

    ShowListView(contentListDiff);
}

//
/// <summary>
/// Click handler: walks result pages 1..2 of a fixed blog address, extracts the
/// article links of each page with <c>ParseArticle</c>, shows them in the list
/// view and saves them through <c>SavePDF</c>.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnGetArticle_Click(object sender, EventArgs e)
{
    // NOTE(review): the extracted source showed this literal as
    // "http://blog.yam.com/..." - a link-rewriting artifact of
    // the page the code was copied from. Restored to the plain address; confirm.
    const string webSite = "http://blog.yam.com/fern724&page=";
    const string Xpath = "//div[@class='post_titlediv']/a[@href]";
    const int lastPage = 2;
    // Upper bound for re-fetching a page; replaces an unbounded
    // "while (result == null)" loop that could freeze the UI thread forever.
    const int maxAttempts = 3;

    for (int j = 1; j <= lastPage; j++)
    {
        listView1.Items.Clear();
        label4.Text = j.ToString(); // show the page number being processed
        Application.DoEvents();     // let the label repaint before the blocking fetch

        site = webSite + j.ToString();
        txtSite.Text = site;
        Application.DoEvents();

        Article = null;
        for (int attempt = 0; attempt < maxAttempts && Article == null; attempt++)
        {
            Article = ParseArticle(site, Xpath);
        }
        if (Article == null)
        {
            // Page could not be parsed after all attempts; move on to the next one.
            continue;
        }

        ShowListView(Article);
        SavePDF(j);
    }

    MessageBox.Show("");
}
/// <summary>
/// Saves every article currently held in <c>Article</c> for one result page and
/// then empties the list. PDF names have the form
/// <c>{j}_{index}.{link without "http://" and "/"}_{link text}.pdf</c>.
/// </summary>
/// <param name="j">Result-page number; it prefixes each PDF file name.</param>
private void SavePDF(int j)
{
    if (Article == null)
    {
        return;
    }

    string[] pdfNames = new string[Article.Count];
    for (int i = 0; i < Article.Count; i++)
    {
        string tempSite = Article[i].HtmlLink.Replace("http://", "").Replace("/", "");
        pdfNames[i] = ToPdfFileName(j + "_" + i + "." + tempSite + "_", Article[i].HtmlLinkText);
    }

    SaveArticleFiles(pdfNames);
    Article.Clear();
}

/// <summary>
/// Click handler: saves every article currently held in <c>Article</c>. PDF names
/// have the form <c>{index}.{link text}.pdf</c>. The list is left untouched.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnSave_Click(object sender, EventArgs e)
{
    if (Article == null)
    {
        return;
    }

    string[] pdfNames = new string[Article.Count];
    for (int i = 0; i < Article.Count; i++)
    {
        pdfNames[i] = ToPdfFileName(i + ".", Article[i].HtmlLinkText);
    }

    SaveArticleFiles(pdfNames);
}

/// <summary>
/// Shared save pipeline for <c>SavePDF</c> and <c>btnSave_Click</c>: optional HTML
/// copies, optional PDF rendering plus move into the PDF folder, then URL stamping.
/// </summary>
/// <param name="pdfNames">PDF file name for each entry of <c>Article</c>, same order.</param>
private void SaveArticleFiles(string[] pdfNames)
{
    // 1) HTML copies.
    if (ckBoxHtml.Checked)
    {
        for (int i = 0; i < Article.Count; i++)
        {
            string htmlName = RemoveInvalidFileNameChars(i + "." + Article[i].HtmlLinkText + ".html");
            SaveWebHTML(Article[i].HtmlLink, htmlName);
        }
    }

    // NOTE(review): the folder name contains a trailing space; characters may have
    // been lost when this source was extracted - confirm the real folder name.
    string pdfFolder = Application.StartupPath + @"\WebPDF \";

    // 2) Render each page to a PDF, then move the PDFs into the PDF folder.
    if (ckBoxPdf.Checked)
    {
        for (int i = 0; i < Article.Count; i++)
        {
            SaveWebPDF(Article[i].HtmlLink, pdfNames[i]);
        }

        for (int i = 0; i < Article.Count; i++)
        {
            // Rendering can fail for a single page; skip it instead of aborting the batch.
            if (File.Exists(pdfNames[i]))
            {
                ChangeSavePath(pdfNames[i], pdfFolder + pdfNames[i]);
            }
        }
    }

    // 3) Stamp the source URL onto each PDF. As before, this step is not gated by
    //    the PDF check box; files that do not exist are now skipped.
    for (int i = 0; i < Article.Count; i++)
    {
        string target = pdfFolder + pdfNames[i];
        if (File.Exists(target))
        {
            AddUrl2PDF(target, Article[i].HtmlLink);
        }
    }
}

/// <summary>
/// Builds a PDF file name from a prefix and a scraped link text.
/// </summary>
/// <param name="prefix">Leading part of the name (indices, site).</param>
/// <param name="linkText">Anchor text taken from the web page.</param>
/// <returns>The name with spaces, quote entities and unsafe characters removed.</returns>
private static string ToPdfFileName(string prefix, string linkText)
{
    // Remove the whole entity first so no stray ';' is left behind.
    string title = linkText.Replace("&quot;", "").Replace("&quot", "");
    string name = RemoveInvalidFileNameChars(prefix + title + ".pdf");

    // The name is later placed on a cmd.exe command line; '&' and '^' are shell
    // metacharacters there, and the text comes from an untrusted web page.
    return name.Replace(" ", "").Replace("&", "").Replace("^", "");
}

/// <summary>
/// Returns <paramref name="name"/> without the characters listed by
/// <c>Path.GetInvalidFileNameChars()</c>.
/// </summary>
private static string RemoveInvalidFileNameChars(string name)
{
    char[] invalid = Path.GetInvalidFileNameChars();
    StringBuilder safe = new StringBuilder(name.Length);
    foreach (char c in name)
    {
        if (Array.IndexOf(invalid, c) < 0)
        {
            safe.Append(c);
        }
    }
    return safe.ToString();
}
// HTML
#region void SaveWebHTML(string html, string filename)
/// <summary>
/// Downloads the page at <paramref name="html"/> and writes it, using the encoding
/// reported by the loaded document, below the application's start-up folder.
/// </summary>
/// <param name="html">Address of the page to download.</param>
/// <param name="filename">File name appended to the output folder path.</param>
private void SaveWebHTML(string html, string filename)
{
    string destination = Application.StartupPath + @"\WebHTML \" + filename;

    HtmlWeb loader = new HtmlWeb { AutoDetectEncoding = true };
    HtmlAgilityPack.HtmlDocument page = loader.Load(html);

    page.Save(destination, page.Encoding);
}
#endregion
// PDF
#region void SaveWebPDF(string html, string filename)
/// <summary>
/// Renders the page at <paramref name="html"/> to a PDF by running
/// <c>phantomjs rasterize2.js &lt;url&gt; &lt;file&gt;</c> through <c>ExecuteCmd</c>.
/// </summary>
/// <param name="html">Address of the page to render.</param>
/// <param name="filename">Output file name; spaces are removed before use.</param>
/// <remarks>
/// NOTE(review): the command uses bare names, so it presumably needs phantomjs on
/// the PATH and rasterize2.js in the working directory - confirm.
/// </remarks>
private void SaveWebPDF(string html, string filename)
{
    filename = filename.Replace(" ", "");

    // Both arguments are wrapped in double quotes: ExecuteCmd runs the line via
    // "cmd.exe /C", where an unquoted '&' (common in URLs such as "...&page=1")
    // ends the command and starts a new one. Embedded quotes would break out of
    // the quoting, so they are encoded (URL) or dropped (file name).
    string quotedUrl = "\"" + html.Replace("\"", "%22") + "\"";
    string quotedFile = "\"" + filename.Replace("\"", "") + "\"";

    string command = "phantomjs" + " rasterize2.js" + " " + quotedUrl + " " + quotedFile;
    ExecuteCmd(command);
}
/// <summary>
/// Runs <paramref name="command"/> through <c>cmd.exe /C</c> without a console
/// window and blocks until the command has exited.
/// </summary>
/// <param name="command">Command line handed to <c>cmd.exe</c>.</param>
private void ExecuteCmd(string command)
{
    // Process wraps an operating-system handle; dispose it deterministically
    // instead of leaving one undisposed instance per rendered page.
    using (System.Diagnostics.Process p = new System.Diagnostics.Process())
    {
        p.StartInfo.FileName = "cmd.exe";
        p.StartInfo.Arguments = @"/C " + command; // /C: run the command, then exit
        p.StartInfo.UseShellExecute = false;
        p.StartInfo.CreateNoWindow = true;        // keep the console hidden

        p.Start();
        p.WaitForExit();
    }
}
#endregion
#region listView
/// <summary>
/// Opens the address stored in the second column of the activated row with the
/// system's default handler. Failures are reported in a message box.
/// </summary>
/// <param name="sender">The <see cref="ListView"/> that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void listView1_ItemActivate(object sender, EventArgs e)
{
    ListView listview = (ListView)sender;

    // Guard against an empty selection or a row without the link column.
    if (listview.SelectedItems.Count == 0 ||
        listview.SelectedItems[0].SubItems.Count < 2)
    {
        return;
    }

    string s1 = listview.SelectedItems[0].SubItems[1].Text;

    try
    {
        Uri url = new Uri(s1);

        // The text originates from a downloaded web page. Only hand web addresses
        // to the shell; a "file:" URI could otherwise launch a local program.
        if (url.Scheme != Uri.UriSchemeHttp && url.Scheme != Uri.UriSchemeHttps)
        {
            MessageBox.Show("Unsupported link: " + s1);
            return;
        }

        System.Diagnostics.Process.Start(url.AbsoluteUri);
    }
    catch (Exception ex)
    {
        // Covers both an unparsable address and a failure to start the handler.
        MessageBox.Show(ex.ToString());
    }
}
#endregion
// =========================
//
#region List<string> ParseLinks(string html)
/// <summary>
/// Loads the page at <paramref name="html"/> and returns the <c>href</c> value of
/// every anchor that has one.
/// </summary>
/// <param name="html">Address of the page to load.</param>
/// <returns>The href values in document order; empty when the page has no such anchors.</returns>
private List<string> ParseLinks(string html)
{
    HtmlWeb web = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);
    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@href]");

    List<string> links = new List<string>();

    // SelectNodes yields null (not an empty collection) when nothing matches.
    if (nodes == null)
    {
        return links;
    }

    foreach (HtmlNode node in nodes)
    {
        if (node == null)
        {
            continue;
        }
        links.Add(node.Attributes["href"].Value);
    }
    return links;
}
#endregion
//
#region private List<string> ParseLinksText(string html, ref List<string> LinkText)
/// <summary>
/// Loads the page at <paramref name="html"/> and returns the <c>href</c> value of
/// every anchor, appending each anchor's inner text to <paramref name="LinkText"/>.
/// </summary>
/// <param name="html">Address of the page to load.</param>
/// <param name="LinkText">
/// List that receives one text entry per returned link; a new list is created when
/// the caller passes null.
/// </param>
/// <returns>The href values in document order; empty when the page has no such anchors.</returns>
private List<string> ParseLinksText(string html, ref List<string> LinkText)
{
    if (LinkText == null)
    {
        LinkText = new List<string>();
    }

    HtmlWeb web = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);
    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@href]");

    List<string> links = new List<string>();

    // SelectNodes yields null (not an empty collection) when nothing matches.
    if (nodes == null)
    {
        return links;
    }

    foreach (HtmlNode node in nodes)
    {
        if (node == null)
        {
            continue;
        }
        links.Add(node.Attributes["href"].Value);
        LinkText.Add(node.InnerText);
    }
    return links;
}
#endregion
// HtmlContent
#region private HtmlContent ParseLinksText(string html)
/// <summary>
/// Loads the page at <paramref name="html"/> and collects the <c>href</c> value and
/// inner text of every anchor into the two parallel lists of an
/// <c>HtmlContent</c>.
/// </summary>
/// <param name="html">Address of the page to load.</param>
/// <returns>The populated container; its lists stay untouched when no anchor matches.</returns>
private HtmlContent ParseLinksText(string html)
{
    HtmlWeb web = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);
    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@href]");

    // assumes HtmlContent's constructor initialises HtmlLinks/HtmlLinkTexts - TODO confirm
    HtmlContent hcontents = new HtmlContent();

    // SelectNodes yields null (not an empty collection) when nothing matches.
    if (nodes == null)
    {
        return hcontents;
    }

    foreach (HtmlNode node in nodes)
    {
        hcontents.HtmlLinks.Add(node.Attributes["href"].Value);
        hcontents.HtmlLinkTexts.Add(node.InnerText);
    }
    return hcontents;
}
#endregion
// Returns one HtmlContentElement (link + text) per anchor of the page
#region private List<HtmlContentElement> ParseLinksText2(string html)
/// <summary>
/// Loads the page at <paramref name="html"/> with encoding auto-detection and
/// returns one element per anchor, holding its <c>href</c> value and inner text.
/// </summary>
/// <param name="html">Address of the page to load.</param>
/// <returns>The elements in document order; empty when the page has no such anchors.</returns>
private List<HtmlContentElement> ParseLinksText2(string html)
{
    List<HtmlContentElement> hconElementOutput = new List<HtmlContentElement>();

    HtmlWeb web = new HtmlWeb();
    web.AutoDetectEncoding = true;
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);

    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@href]");

    // SelectNodes yields null (not an empty collection) when nothing matches.
    if (nodes == null)
    {
        return hconElementOutput;
    }

    foreach (HtmlNode node in nodes)
    {
        HtmlContentElement hconElement = new HtmlContentElement();
        hconElement.HtmlLink = node.Attributes["href"].Value;
        hconElement.HtmlLinkText = node.InnerText;
        hconElementOutput.Add(hconElement);
    }
    return hconElementOutput;
}
#endregion
// Keep only the entries that contain an http:// or https:// address
#region private List<string> CheckValidHTTPLink(List<string> input)
/// <summary>
/// Filters <paramref name="input"/> down to the entries that contain
/// <c>http://</c> or <c>https://</c>, dropping any text in front of the scheme.
/// </summary>
/// <param name="input">Raw link strings; null entries are skipped.</param>
/// <returns>
/// Each kept entry, starting at its first scheme occurrence; empty for null or
/// empty input.
/// </returns>
private List<string> CheckValidHTTPLink(List<string> input)
{
    List<string> validLinks = new List<string>();
    if (input == null)
    {
        return validLinks;
    }

    foreach (string s in input)
    {
        if (s == null)
        {
            continue;
        }

        int httpAt = s.IndexOf("http://", StringComparison.Ordinal);
        int httpsAt = s.IndexOf("https://", StringComparison.Ordinal);

        // Cut at whichever scheme comes FIRST. Testing "http://" before "https://"
        // would truncate "https://a/?u=http://b" to the embedded "http://b".
        int start;
        if (httpAt < 0)
        {
            start = httpsAt;
        }
        else if (httpsAt < 0)
        {
            start = httpAt;
        }
        else
        {
            start = Math.Min(httpAt, httpsAt);
        }

        if (start >= 0)
        {
            validLinks.Add(s.Substring(start));
        }
    }
    return validLinks;
}
#endregion

// pure http
#region private List<HtmlContentElement> PureHTTPLink(List<HtmlContentElement> inputLinkList)
/// <summary>
/// Keeps the elements whose <c>HtmlLink</c> contains <c>http://</c> or
/// <c>https://</c> and trims each kept link so that it starts at the scheme.
/// </summary>
/// <param name="inputLinkList">Elements to filter; kept elements are modified in place.</param>
/// <returns>The kept elements in input order; empty for null or empty input.</returns>
private List<HtmlContentElement> PureHTTPLink(List<HtmlContentElement> inputLinkList)
{
    List<HtmlContentElement> outputLinkList = new List<HtmlContentElement>();
    if (inputLinkList == null)
    {
        return outputLinkList;
    }

    foreach (HtmlContentElement content in inputLinkList)
    {
        if (content == null || content.HtmlLink == null)
        {
            continue;
        }

        int httpAt = content.HtmlLink.IndexOf("http://", StringComparison.Ordinal);
        int httpsAt = content.HtmlLink.IndexOf("https://", StringComparison.Ordinal);

        // Cut at whichever scheme comes FIRST. Testing "http://" before "https://"
        // would truncate "https://a/?u=http://b" to the embedded "http://b".
        int start;
        if (httpAt < 0)
        {
            start = httpsAt;
        }
        else if (httpsAt < 0)
        {
            start = httpAt;
        }
        else
        {
            start = Math.Min(httpAt, httpsAt);
        }

        if (start < 0)
        {
            continue; // relative link, script link, anchor, ...
        }

        content.HtmlLink = content.HtmlLink.Substring(start);
        outputLinkList.Add(content);
    }
    return outputLinkList;
}
#endregion
// http
#region private List<HtmlContentElement> ParseArticle(string html, string Xpath)
/// <summary>
/// Loads the page at <paramref name="html"/> with encoding auto-detection and
/// returns one element (href value + inner text) per node selected by
/// <paramref name="Xpath"/>.
/// </summary>
/// <param name="html">Address of the page to load.</param>
/// <param name="Xpath">XPath expression selecting the anchor nodes to collect.</param>
/// <returns>The elements in document order; empty when the expression matches nothing.</returns>
private List<HtmlContentElement> ParseArticle(string html, string Xpath)
{
    List<HtmlContentElement> hconElementOutput = new List<HtmlContentElement>();

    HtmlWeb web = new HtmlWeb();
    web.AutoDetectEncoding = true;
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);

    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(Xpath);

    // SelectNodes yields null (not an empty collection) when nothing matches.
    if (nodes == null)
    {
        return hconElementOutput;
    }

    foreach (HtmlNode node in nodes)
    {
        HtmlContentElement hconElement = new HtmlContentElement();
        // GetAttributeValue tolerates expressions that select nodes without href.
        hconElement.HtmlLink = node.GetAttributeValue("href", "");
        hconElement.HtmlLinkText = node.InnerText;
        hconElementOutput.Add(hconElement);
    }
    return hconElementOutput;
}
#endregion
// show ListView
#region void ShowListView(List<HtmlContentElement> contentListDiff)
/// <summary>
/// Appends one row per element to <c>listView1</c> in details view: link text in
/// the first column, link address in the second.
/// </summary>
/// <param name="contentListDiff">Elements to show; null is treated as empty.</param>
private void ShowListView(List<HtmlContentElement> contentListDiff)
{
    // Suspend repainting while rows are added; always resumed in the finally block.
    this.listView1.BeginUpdate();
    try
    {
        listView1.View = View.Details;

        // Create the two columns only once. Adding them on every call piled up
        // extra empty columns each time the list was refreshed.
        while (listView1.Columns.Count < 2)
        {
            listView1.Columns.Add("");
        }
        listView1.Columns[0].Width = 90;
        listView1.Columns[1].Width = 600;

        if (contentListDiff != null)
        {
            foreach (HtmlContentElement element in contentListDiff)
            {
                ListViewItem itemTemp = new ListViewItem();
                itemTemp.Text = element.HtmlLinkText;    // column 1: link text
                itemTemp.SubItems.Add(element.HtmlLink); // column 2: link address
                listView1.Items.Add(itemTemp);
            }
        }
    }
    finally
    {
        this.listView1.EndUpdate();
    }
}
#endregion
//
#region void ChangeSavePath(string sourcePath, string TargetPath)
/// <summary>
/// Moves a file: copies <paramref name="sourcePath"/> to
/// <paramref name="TargetPath"/>, replacing an existing target, then deletes the
/// source.
/// </summary>
/// <param name="sourcePath">File to move.</param>
/// <param name="TargetPath">Destination path; overwritten when it already exists.</param>
private void ChangeSavePath(string sourcePath, string TargetPath)
{
    // File.Copy streams the content. The previous hand-written copy loaded the
    // whole file with a single Stream.Read call, which is not guaranteed to fill
    // the buffer and could therefore write a truncated or zero-padded copy.
    System.IO.File.Copy(sourcePath, TargetPath, true);
    System.IO.File.Delete(sourcePath);
}
#endregion
// pdf
#region public void AddUrl2PDF(string filepath, string url)
/// <summary>
/// Rewrites the PDF at <paramref name="filepath"/> so that every page carries
/// <paramref name="url"/> as small dark-gray text near the top-left corner.
/// </summary>
/// <param name="filepath">PDF to stamp; it is replaced by the stamped version.</param>
/// <param name="url">Text to print on each page; null is treated as empty.</param>
public void AddUrl2PDF(string filepath, string url)
{
    string oldFile = filepath;
    // The work file sits beside the target. A fixed "NewFile.pdf" in the current
    // directory depended on the working directory and on nothing else using that name.
    string newFile = filepath + ".tmp";
    string text = url ?? string.Empty;

    try
    {
        PdfReader reader = new PdfReader(oldFile);
        try
        {
            // Position and page size are both taken from page 1, as before.
            iTextSharp.text.Rectangle dimension = reader.GetPageSize(1);
            iTextSharp.text.Rectangle size = reader.GetPageSizeWithRotation(1);

            // Loop-invariant: one font object serves every page.
            BaseFont bf = BaseFont.CreateFont(BaseFont.HELVETICA, BaseFont.CP1252, BaseFont.NOT_EMBEDDED);

            using (FileStream fs = new FileStream(newFile, FileMode.Create, FileAccess.Write))
            {
                Document document = new Document(size);
                PdfWriter writer = PdfWriter.GetInstance(document, fs);
                document.Open();

                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    document.NewPage();
                    PdfContentByte cb = writer.DirectContent;

                    cb.SetColorFill(BaseColor.DARK_GRAY);
                    cb.SetFontAndSize(bf, 8);

                    cb.BeginText();
                    cb.ShowTextAligned(PdfContentByte.ALIGN_LEFT, text,
                        dimension.GetLeft(5), dimension.GetTop(8), 0);
                    cb.EndText();

                    // NOTE(review): the imported page is drawn after the text on the
                    // same layer, so opaque page content may cover the URL - confirm.
                    PdfImportedPage page = writer.GetImportedPage(reader, i);
                    cb.AddTemplate(page, 0, 0);
                }

                // Finishes the PDF; only reached when every page was written.
                document.Close();
            }
        }
        finally
        {
            // Release the source file before it is overwritten below.
            reader.Close();
        }

        File.Copy(newFile, oldFile, true);
    }
    finally
    {
        // Remove the work file on success and on failure alike; on failure the
        // original PDF is left untouched.
        File.Delete(newFile);
    }
}
#endregion

//===================================================
}
}