Web Scraping Code

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the Excel file
input_file_path = "D:\\Stock Market Analysis project\\Web Scraping\\Final Codes\\ind_nifty100list.xlsx"
df = pd.read_excel(input_file_path)
logging.info("Excel file loaded successfully.")

def get_soup(url):
    """Fetch a URL in headless Chrome and return the rendered page as BeautifulSoup."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # Selenium 4 takes the driver path via a Service object, not a positional argument
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(url)
        time.sleep(5)  # Allow time for JavaScript to execute
        page_source = driver.page_source
    finally:
        driver.quit()  # Always release the browser, even if the fetch fails
    return BeautifulSoup(page_source, "html.parser")
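# Note: the fixed 5-second sleep above is simple but fragile. A minimal sketch of a
# sturdier wait (assuming the page signals readiness via document.readyState) would be:
#
#     from selenium.webdriver.support.ui import WebDriverWait
#     WebDriverWait(driver, 10).until(
#         lambda d: d.execute_script('return document.readyState') == 'complete')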

def extract_general_info(isin):
    search_url = f'https://www.investing.com/search/?q={isin}'
    logging.info(f"Searching ISIN {isin} at URL: {search_url}")
    soup = get_soup(search_url)

    try:
        link = soup.find(class_="js-inner-all-results-quote-item row").get('href')
        stock_url = f'https://www.investing.com{link}'
        logging.info(f"Found stock URL: {stock_url}")
        stock_soup = get_soup(stock_url)

        key_info_section = stock_soup.find('div', {'class': 'border-r-[#e6e9eb] text-xs leading-4 sm:flex md:border-r md:pr-8 flex-1 sm:h-[23rem] md:mr-8', 'data-test': 'key-info'})

        # List of keywords to find
        jargon = ["Open", "Prev. Close", "52 wk Range", "Volume", "EPS", "P/E Ratio", "Dividend (Yield)"]
        general_info = {}

        # Each statistic label is a <span>; its value sits in a following <span>
        spans = key_info_section.find_all('span')
        for keyword in jargon:
            for i in range(len(spans)):
                text = spans[i].get_text(strip=True)  # Get text without extra whitespace
                if text == keyword:
                    # Special case for "52 wk Range": low and high sit in separate spans
                    if keyword == "52 wk Range":
                        if i + 6 < len(spans):
                            low_value = spans[i + 1].get_text(strip=True)   # Get the low value
                            high_value = spans[i + 6].get_text(strip=True)  # Get the high value
                            general_info['52 wk Range Low'] = low_value
                            general_info['52 wk Range High'] = high_value
                            print(f"Low value for '52 wk Range': {low_value}")
                            print(f"High value for '52 wk Range': {high_value}")
                        else:
                            print("'52 wk Range' does not have enough elements")
                    else:
                        # Check if the next element exists for other keywords
                        if i + 1 < len(spans):
                            next_value = spans[i + 1].get_text(strip=True)  # Get the next value
                            general_info[keyword] = next_value
                            print(f"Next value after '{keyword}':", next_value)
                        else:
                            print(f"'{keyword}' is the last element")
                    break  # Stop searching after finding the keyword

        # Find the nav-bar 'li' whose 'a' tag contains the given text and return its href
        def find_nav_link(link_text):
            for li in stock_soup.find_all('li', class_='group relative -mb-0.75 cursor-pointer border-b-3 border-t-3 border-transparent py-3.25 text-base/6 font-semibold hover:text-[#1256a0]'):
                a_tag = li.find('a')
                if a_tag and link_text in a_tag.get_text(strip=True):
                    return a_tag['href'].strip()
            return None

        financials_link = find_nav_link('Financials')
        if financials_link:
            financials_link = f"https://www.investing.com{financials_link}"
            print("Financials link:", financials_link)
        else:
            print("Financials link not found.")

        technical_link = find_nav_link('Technical')
        if technical_link:
            technical_link = f"https://www.investing.com{technical_link}"
            print("Technical link:", technical_link)
        else:
            print("Technical link not found.")

        general_info['Stock URL'] = stock_url
        general_info['Financials URL'] = financials_link  # Add financials link to general info
        general_info['Technical URL'] = technical_link    # Add technical link to general info
        logging.info(f"General info extracted for ISIN: {isin}")
        return general_info
    except Exception as e:
        logging.error(f"Failed to extract general info for ISIN: {isin}, Error: {e}")
        return None
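# On success this returns a dict of the scraped key statistics ('Open',
# 'Prev. Close', '52 wk Range Low'/'High', 'Volume', 'EPS', 'P/E Ratio',
# 'Dividend (Yield)') plus 'Stock URL', 'Financials URL' and 'Technical URL';
# on any failure it returns None so the caller can skip the stock.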

def extract_financial_info(financials_url):
    print(f"Extracting financial info from URL: {financials_url}")
    financial_soup = get_soup(financials_url)

    def extract_latest_value(table, header):
        # The header cell's next sibling holds the latest reported value
        row = table.find('td', string=header)
        if row:
            data_cell = row.find_next_sibling('td')
            return data_cell.text if data_cell else 'N/A'
        return 'N/A'

    tables = financial_soup.find_all('table', class_='genTbl openTbl companyFinancialSummaryTbl')
    data = {
        'Total Revenue': extract_latest_value(tables[0], "Total Revenue"),
        'Net Income': extract_latest_value(tables[0], "Net Income"),
        'Total Assets': extract_latest_value(tables[1], "Total Assets"),
        'Total Liabilities': extract_latest_value(tables[1], "Total Liabilities"),
        'Total Equity': extract_latest_value(tables[1], "Total Equity"),
        'Cash From Operating Activities': extract_latest_value(tables[2], "Cash From Operating Activities"),
        'Cash From Investing Activities': extract_latest_value(tables[2], "Cash From Investing Activities"),
        'Cash From Financing Activities': extract_latest_value(tables[2], "Cash From Financing Activities")
    }
    logging.info(f"Financial info extracted from: {financials_url}")
    print(f"Financial info for URL {financials_url}: {data}")
    return data
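# Note: the indexing above assumes the summary page renders exactly three tables in
# order: income statement (tables[0]), balance sheet (tables[1]) and cash flow
# (tables[2]). If fewer tables are present, the resulting IndexError is caught and
# logged by process_stock().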

def extract_technical_info(technical_url):
    print(f"Extracting technical info from URL: {technical_url}")
    technical_soup = get_soup(technical_url)

    def extract_ma_value(label):
        label_element = technical_soup.find('td', string=label)
        if label_element:
            value_element = label_element.find_next_sibling('td')
            # The cell pairs the value with a Buy/Sell signal; keep only the first (numeric) token
            return value_element.text.split()[0] if value_element else 'N/A'
        return 'N/A'

    def extract_indicator_value(label):
        label_element = technical_soup.find('td', string=label)
        if label_element:
            value_element = label_element.find_next_sibling('td')
            return value_element.text.strip() if value_element else 'N/A'
        return 'N/A'

    data = {
        'MA50': extract_ma_value('MA50'),
        'MA100': extract_ma_value('MA100'),
        'MA200': extract_ma_value('MA200'),
        'RSI': extract_indicator_value('RSI(14)'),
        'MACD': extract_indicator_value('MACD(12,26)')
    }
    logging.info(f"Technical info extracted from: {technical_url}")
    print(f"Technical info for URL {technical_url}: {data}")
    return data

def process_stock(row):
    isin = row['ISIN Code']
    try:
        general_info = extract_general_info(isin)
        if general_info:
            stock_url = general_info.pop('Stock URL')
            financials_url = general_info.pop('Financials URL')
            technical_url = general_info.pop('Technical URL')
            financial_info = extract_financial_info(financials_url) if financials_url else {}
            technical_info = extract_technical_info(technical_url) if technical_url else {}
            all_info = {**row[['Company Name', 'Industry', 'Symbol', 'ISIN Code']].to_dict(),
                        **general_info, **financial_info, **technical_info}
            return all_info
    except Exception as e:
        logging.error(f"Error processing ISIN {isin}: {e}")
    return None
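# The merge in process_stock() relies on dict-unpacking order: should a scraped
# field ever share a name with a spreadsheet column, the later dict (general,
# financial, then technical info) silently overrides the earlier value.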

all_stock_data = []

with ThreadPoolExecutor(max_workers=10) as executor:
    future_to_row = {executor.submit(process_stock, row): row for _, row in df.iterrows()}
    for future in as_completed(future_to_row):
        row = future_to_row[future]
        try:
            data = future.result()
            if data:
                all_stock_data.append(data)
        except Exception as e:
            logging.error(f"Exception for ISIN {row['ISIN Code']}: {e}")
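# Note: max_workers=10 launches up to ten headless Chrome instances at once, which
# is memory-hungry; lowering it is a reasonable trade-off on smaller machines.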

# Convert to DataFrame and save to Excel
all_stock_df = pd.DataFrame(all_stock_data)
output_file = "D:\\Stock Market Analysis project\\Web Scraping\\Final Codes\\Output File.xlsx"
all_stock_df.to_excel(output_file, index=False)
logging.info("Data saved to Excel successfully.")
print(f"Data saved to Excel file: {output_file}")
