Practical 7 IR
import requests
import time
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
def get_html(url):
    # Fetch a page with a browser-like User-Agent header
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.exceptions.HTTPError as errh:
        print(f"HTTP Error: {errh}")
        return None
    except requests.exceptions.RequestException as err:
        # Catch connection errors, invalid schemes, timeouts, etc.
        print(f"Request Error: {err}")
        return None
def save_robots_txt(url):
    try:
        robots_url = urljoin(url, '/robots.txt')
        robots_content = get_html(robots_url)
        if robots_content:
            # Cache robots.txt locally (local filename 'robots.txt' is assumed)
            with open('robots.txt', 'wb') as file:
                file.write(robots_content.encode('utf-8-sig'))
    except Exception as e:
        print(f"Error saving robots.txt: {e}")
def load_robots_txt():
    try:
        # Read back the locally cached robots.txt (same assumed filename)
        with open('robots.txt', 'r', encoding='utf-8-sig') as file:
            return file.read()
    except FileNotFoundError:
        return None
def extract_links(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    # Collect every anchor's href, resolved against the base URL
    for link in soup.find_all('a', href=True):
        absolute_url = urljoin(base_url, link.get('href'))
        links.append(absolute_url)
    return links
def is_allowed_by_robots(url, robots_content):
    parser = RobotFileParser()
    parser.parse(robots_content.split('\n'))
    return parser.can_fetch('*', url)
def crawl(start_url, max_depth=3, delay=1):
    visited_urls = set()

    def recursive_crawl(url, depth, robots_content):
        # Stop at the depth limit, on already-visited URLs,
        # or when robots.txt disallows the URL (reconstructed guard)
        if depth > max_depth or url in visited_urls or not is_allowed_by_robots(url, robots_content):
            return
        visited_urls.add(url)
        time.sleep(delay)  # polite delay between requests
        html = get_html(url)
        if html:
            print(f"Crawling {url}")
            links = extract_links(html, url)
            for link in links:
                recursive_crawl(link, depth + 1, robots_content)

    save_robots_txt(start_url)
    robots_content = load_robots_txt()
    if not robots_content:
        # Assumed fallback: treat a missing robots.txt as an empty ruleset
        robots_content = ''
    recursive_crawl(start_url, 1, robots_content)
    print("Performed by Raj")

crawl("https://wikipedia.com", max_depth=2, delay=2)
Output: