0% found this document useful (0 votes)
16 views

Web Scraping Code

Uploaded by

Priyansh Arya
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
16 views

Web Scraping Code

Uploaded by

Priyansh Arya
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 4

import numpy as np

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import os

# --- Module-level setup -----------------------------------------------------
# Root-logger configuration: timestamp, level, message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Source workbook listing the NIFTY-100 constituents (one row per stock,
# including an 'ISIN Code' column consumed by process_stock below).
input_file_path = "D:\\Stock Market Analysis project\\Web Scraping\\Final Codes\\ind_nifty100list.xlsx"
df = pd.read_excel(input_file_path)
logging.info("Excel file loaded successfully.")

def get_soup(url):
    """Fetch *url* in headless Chrome and return it parsed as BeautifulSoup.

    A fresh browser is launched per call and always torn down again.
    FIX: the original called driver.quit() on the success path only, so any
    exception during navigation leaked a Chrome process; quit() now runs in
    a finally block.

    NOTE(review): webdriver.Chrome(path, options=...) is the Selenium 3 call
    signature; Selenium 4 expects a Service object — confirm the pinned
    selenium version before upgrading.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    try:
        driver.get(url)
        time.sleep(5)  # Allow time for JavaScript to execute
        page_source = driver.page_source
    finally:
        driver.quit()  # always release the browser, even when navigation fails
    return BeautifulSoup(page_source, "html.parser")

def extract_general_info(isin):
    """Find a stock on investing.com by ISIN and scrape its key statistics.

    Returns a dict with the key-info figures (Open, Prev. Close, 52 wk Range
    Low/High, Volume, EPS, P/E Ratio, Dividend (Yield)) plus 'Stock URL',
    'Financials URL' and 'Technical URL' entries, or None when any step of
    the scrape fails (the failure is logged).

    Improvements over the original:
      * the span scan (find_all) is hoisted out of the per-keyword loop
        instead of being re-run for every keyword;
      * the two verbatim-duplicated nav-link scans (Financials / Technical)
        are folded into one helper;
      * the redundant `i+1 < len and i+6 < len` bound check is simplified.
    """
    search_url = f'https://www.investing.com/search/?q={isin}'
    logging.info(f"Searching ISIN {isin} at URL: {search_url}")
    soup = get_soup(search_url)

    try:
        # First search hit links to the stock's quote page.
        link = soup.find(class_="js-inner-all-results-quote-item row").get('href')
        stock_url = f'https://www.investing.com{link}'
        logging.info(f"Found stock URL: {stock_url}")
        stock_soup = get_soup(stock_url)

        # Key-statistics panel; selected by its (fragile) utility-class string
        # plus the data-test attribute. TODO confirm the class string survives
        # site redesigns.
        key_info_section = stock_soup.find(
            'div',
            {'class': 'border-r-[#e6e9eb] text-xs leading-4 sm:flex md:border-r '
                      'md:pr-8 flex-1 sm:h-[23rem] md:mr-8',
             'data-test': 'key-info'})

        # Labels to harvest; each label's value is the span right after it.
        jargon = ["Open", "Prev. Close", "52 wk Range", "Volume", "EPS",
                  "P/E Ratio", "Dividend (Yield)"]
        general_info = {}

        # Hoisted: scan the spans once, not once per keyword.
        texts = [s.get_text(strip=True) for s in key_info_section.find_all('span')]

        for keyword in jargon:
            for i, text in enumerate(texts):
                if text != keyword:
                    continue
                if keyword == "52 wk Range":
                    # Low sits in the next span, high six spans further on
                    # (layout assumption from the original — TODO confirm).
                    if i + 6 < len(texts):
                        low_value = texts[i + 1]
                        high_value = texts[i + 6]
                        general_info['52 wk Range Low'] = low_value
                        general_info['52 wk Range High'] = high_value
                        print(f"Low value for '52 wk Range': {low_value}")
                        print(f"High value for '52 wk Range': {high_value}")
                    else:
                        print("'52 wk Range' does not have enough elements")
                else:
                    if i + 1 < len(texts):
                        next_value = texts[i + 1]
                        general_info[keyword] = next_value
                        print(f"Next value after '{keyword}':", next_value)
                    else:
                        print(f"'{keyword}' is the last element")
                break  # Stop searching after finding the keyword

        # Top-navigation tab class (same fragile utility-class caveat as above).
        nav_li_class = ('group relative -mb-0.75 cursor-pointer border-b-3 '
                        'border-t-3 border-transparent py-3.25 text-base/6 '
                        'font-semibold hover:text-[#1256a0]')

        def _find_nav_link(label):
            # Shared scan replacing the two duplicated loops of the original:
            # href of the first nav tab whose anchor text contains *label*.
            for li in stock_soup.find_all('li', class_=nav_li_class):
                a_tag = li.find('a')
                if a_tag and label in a_tag.get_text(strip=True):
                    return a_tag['href'].strip()
            return None

        financials_link = _find_nav_link('Financials')
        if financials_link:
            financials_link = f"https://www.investing.com{financials_link}"
            print("Financials link:", financials_link)
        else:
            print("Financials link not found.")

        technical_link = _find_nav_link('Technical')
        if technical_link:
            technical_link = f"https://www.investing.com{technical_link}"
            print("Technical link:", technical_link)
        else:
            print("Technical link not found.")

        general_info['Stock URL'] = stock_url
        general_info['Financials URL'] = financials_link  # may be None
        general_info['Technical URL'] = technical_link    # may be None
        logging.info(f"General info extracted for ISIN: {isin}")
        return general_info
    except Exception as e:
        logging.error(f"Failed to extract general info for ISIN: {isin}, Error: {e}")
        return None

def extract_financial_info(financials_url):
    """Scrape the income/balance/cash-flow summary tables from a Financials page.

    Returns a dict of eight headline figures, each 'N/A' when not found.

    FIX: the original indexed tables[0]..tables[2] unconditionally, so a page
    with fewer summary tables raised IndexError and the whole stock record was
    dropped upstream; missing tables now yield 'N/A' instead.
    """
    print(f"Extracting financial info from URL: {financials_url}")
    financial_soup = get_soup(financials_url)

    # Tables appear in page order: income statement, balance sheet, cash flow
    # (layout assumption inherited from the original — TODO confirm).
    tables = financial_soup.find_all('table',
                                     class_='genTbl openTbl companyFinancialSummaryTbl')

    def extract_latest_value(table_idx, header):
        # Text of the cell right after the row-header cell, or 'N/A' when
        # the table or the header row is absent.
        if table_idx >= len(tables):
            return 'N/A'
        row = tables[table_idx].find('td', string=header)
        if row:
            data_cell = row.find_next_sibling('td')
            return data_cell.text if data_cell else 'N/A'
        return 'N/A'

    data = {
        'Total Revenue': extract_latest_value(0, "Total Revenue"),
        'Net Income': extract_latest_value(0, "Net Income"),
        'Total Assets': extract_latest_value(1, "Total Assets"),
        'Total Liabilities': extract_latest_value(1, "Total Liabilities"),
        'Total Equity': extract_latest_value(1, "Total Equity"),
        'Cash From Operating Activities': extract_latest_value(2, "Cash From Operating Activities"),
        'Cash From Investing Activities': extract_latest_value(2, "Cash From Investing Activities"),
        'Cash From Financing Activities': extract_latest_value(2, "Cash From Financing Activities")
    }
    logging.info(f"Financial info extracted from: {financials_url}")
    print(f"Financial info for URL {financials_url}: {data}")
    return data

def extract_technical_info(technical_url):
    """Scrape moving averages and indicators from a Technical-analysis page.

    Returns a dict with MA50/MA100/MA200 (numeric text, Buy/Sell verdict
    stripped) and RSI/MACD (raw cell text); 'N/A' for anything not found.

    FIX: the original used value.rstrip('Buy').rstrip('Sell'), but str.rstrip
    strips a *character set*, not a suffix — it can eat legitimate trailing
    characters. The literal 'Buy'/'Sell' suffix is now removed explicitly.
    Also guards split()[0] against an empty cell (IndexError before).
    """
    print(f"Extracting technical info from URL: {technical_url}")
    technical_soup = get_soup(technical_url)

    def _sibling_text(label):
        # Raw text of the cell following the label cell, or None if missing.
        label_element = technical_soup.find('td', string=label)
        if label_element:
            value_element = label_element.find_next_sibling('td')
            if value_element:
                return value_element.text
        return None

    def extract_ma_value(label):
        raw = _sibling_text(label)
        if raw is None:
            return 'N/A'
        tokens = raw.split()
        if not tokens:  # empty cell: original crashed with IndexError here
            return 'N/A'
        value = tokens[0].strip()
        # Remove the literal verdict suffix ('123.45Buy' -> '123.45').
        for suffix in ('Buy', 'Sell'):
            if value.endswith(suffix):
                value = value[:-len(suffix)]
                break
        return value.strip()

    def extract_indicator_value(label):
        raw = _sibling_text(label)
        return raw.strip() if raw is not None else 'N/A'

    data = {
        'MA50': extract_ma_value('MA50'),
        'MA100': extract_ma_value('MA100'),
        'MA200': extract_ma_value('MA200'),
        'RSI': extract_indicator_value('RSI(14)'),
        'MACD': extract_indicator_value('MACD(12,26)')
    }
    logging.info(f"Technical info extracted from: {technical_url}")
    print(f"Technical info for URL {technical_url}: {data}")
    return data

def process_stock(row):
    """Build one combined record for a single stock row.

    Scrapes general, financial and technical info for the row's ISIN and
    merges everything with the identifying columns from the input sheet.
    Returns the merged dict, or None when scraping failed (error is logged).
    """
    isin = row['ISIN Code']
    try:
        info = extract_general_info(isin)
        if info:
            # Pull the navigation URLs out so they don't end up in the output.
            info.pop('Stock URL')
            fin_url = info.pop('Financials URL')
            tech_url = info.pop('Technical URL')

            record = row[['Company Name', 'Industry', 'Symbol', 'ISIN Code']].to_dict()
            record.update(info)
            if fin_url:
                record.update(extract_financial_info(fin_url))
            if tech_url:
                record.update(extract_technical_info(tech_url))
            return record
    except Exception as e:
        logging.error(f"Error processing ISIN {isin}: {e}")
        return None

# --- Driver: scrape all stocks concurrently and persist the result ----------
all_stock_data = []

# I/O-bound scraping: fan out over a thread pool, collect as results land.
with ThreadPoolExecutor(max_workers=10) as executor:
    future_to_row = {executor.submit(process_stock, row): row
                     for _, row in df.iterrows()}
    for future in as_completed(future_to_row):
        src_row = future_to_row[future]
        try:
            result = future.result()
        except Exception as e:
            logging.error(f"Exception for ISIN {src_row['ISIN Code']}: {e}")
        else:
            if result:
                all_stock_data.append(result)

# Write the merged records to the output workbook.
all_stock_df = pd.DataFrame(all_stock_data)
output_file = "D:\\Stock Market Analysis project\\Web Scraping\\Final Codes\\Output File.xlsx"
all_stock_df.to_excel(output_file, index=False)
logging.info("Data saved to Excel successfully.")
print(f"Data saved to Excel file: {output_file}")

You might also like