# Web Scraping Code
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
# Configure the root logger once at import time: INFO and above, with each
# record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
def get_soup(url, wait_seconds=5):
    """Load *url* in headless Chrome and return the rendered page as soup.

    Args:
        url: Address of the page to open.
        wait_seconds: Fixed pause after navigation so the page's JavaScript
            can run before the DOM is captured. Defaults to 5.

    Returns:
        A ``BeautifulSoup`` tree built from the browser's page source using
        the built-in ``html.parser``.

    Any exception raised while starting or driving the browser propagates to
    the caller; the browser is shut down either way.
    """
    # Local import keeps this function self-contained; selenium itself is
    # already a dependency of this module.
    from selenium.webdriver.chrome.service import Service

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')

    # Selenium 4 receives the chromedriver path through a Service object;
    # passing the path as the first positional argument is no longer accepted.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)
        time.sleep(wait_seconds)  # Allow time for JavaScript to execute
        page_source = driver.page_source
    finally:
        # Always release the browser process, even when navigation fails.
        driver.quit()
    return BeautifulSoup(page_source, "html.parser")
def _find_tab_link(stock_soup, label):
    """Return the href of the first navigation tab whose text contains *label*.

    Only ``li`` elements carrying the exact tab class string below are
    inspected. Returns ``None`` when no such tab (or no href on it) exists.
    """
    tab_class = (
        'group relative -mb-0.75 cursor-pointer border-b-3 border-t-3 '
        'border-transparent py-3.25 text-base/6 font-semibold '
        'hover:text-[#1256a0]'
    )
    for li in stock_soup.find_all('li', class_=tab_class):
        a_tag = li.find('a')
        if a_tag and label in a_tag.get_text(strip=True):
            href = a_tag.get('href')
            return href.strip() if href else None
    return None


def extract_general_info(isin):
    """Locate a stock's page and its Financials/Technical tab URLs by ISIN.

    The ISIN is submitted to the site search, the first quote result is
    followed, and the stock page's navigation tabs are scanned for the
    "Financials" and "Technical" links.

    Args:
        isin: ISIN code to search for.

    Returns:
        A dict with the keys ``'Stock URL'``, ``'Financials URL'`` and
        ``'Technical URL'`` (the keys ``process_stock`` pops from the
        result). A tab URL is ``None`` when that tab was not found. Returns
        ``None`` when the search yields no result or the stock page cannot
        be processed.
    """
    base_url = 'https://www.investing.com'
    search_url = f'{base_url}/search/?q={isin}'
    logging.info(f"Searching ISIN {isin} at URL: {search_url}")
    soup = get_soup(search_url)
    try:
        result = soup.find(class_="js-inner-all-results-quote-item row")
        link = result.get('href') if result is not None else None
        if not link:
            # No matching quote in the search results: nothing to follow.
            logging.warning(f"No search result found for ISIN {isin}")
            return None

        stock_url = f'{base_url}{link}'
        logging.info(f"Found stock URL: {stock_url}")
        stock_soup = get_soup(stock_url)

        # Find the 'li' element with 'a' tag containing the text 'Financials'
        financials_link = _find_tab_link(stock_soup, 'Financials')
        if financials_link:
            financials_link = f"{base_url}{financials_link}"
            print("Financials link:", financials_link)
        else:
            print("Financials link not found.")

        # Find the 'li' element with 'a' tag containing the text 'Technical'
        technical_link = _find_tab_link(stock_soup, 'Technical')
        if technical_link:
            technical_link = f"{base_url}{technical_link}"
            print("Technical link:", technical_link)
        else:
            print("Technical link not found.")

        return {
            'Stock URL': stock_url,
            'Financials URL': financials_link,
            'Technical URL': technical_link,
        }
    except Exception as e:
        # Same best-effort policy as process_stock: log and signal failure
        # with None so one bad ISIN does not abort a batch.
        logging.error(f"Error extracting general info for ISIN {isin}: {e}")
        return None
def extract_financial_info(financials_url):
    """Fetch the financials page for a stock and return its extracted fields.

    Args:
        financials_url: URL of the stock's financials page.

    Returns:
        A dict of financial fields. The code visible here defines only the
        cell-lookup helper and no field extraction, so the dict is currently
        empty; an empty dict (rather than ``None``) keeps the ``**`` merge in
        ``process_stock`` working.
    """
    print(f"Extracting financial info from URL: {financials_url}")
    financial_soup = get_soup(financials_url)

    def extract_latest_value(table, header):
        """Return the text of the ``td`` following the one whose string is *header*.

        Yields ``'N/A'`` when the header cell or its next ``td`` sibling is
        missing.
        """
        header_cell = table.find('td', string=header)
        if header_cell is None:
            return 'N/A'
        data_cell = header_cell.find_next_sibling('td')
        return data_cell.text if data_cell is not None else 'N/A'

    # NOTE(review): no tables are looked up in financial_soup and
    # extract_latest_value is never called in the code as it stands -- the
    # field-extraction section appears to be missing. TODO: restore it and
    # populate the returned dict.
    return {}
def extract_technical_info(technical_url):
    """Scrape moving averages and indicators from a stock's technical page.

    Args:
        technical_url: URL of the stock's technical-analysis page.

    Returns:
        A dict with the keys ``'MA50'``, ``'MA100'``, ``'MA200'``, ``'RSI'``
        and ``'MACD'``. Each value is the text found in the table cell next
        to the matching label, or ``'N/A'`` when the label or its value cell
        is missing.
    """
    print(f"Extracting technical info from URL: {technical_url}")
    technical_soup = get_soup(technical_url)

    def cell_text_after(label):
        """Return the text of the ``td`` after the one whose string is *label*, or None."""
        label_element = technical_soup.find('td', string=label)
        if label_element is None:
            return None
        value_element = label_element.find_next_sibling('td')
        return None if value_element is None else value_element.text

    def extract_ma_value(label):
        """Return the first whitespace-separated token of the value cell for *label*."""
        text = cell_text_after(label)
        tokens = text.split() if text is not None else []
        if not tokens:
            # Missing label, missing cell, or an empty cell.
            return 'N/A'
        value = tokens[0]
        # Remove a trailing 'Buy' or 'Sell' marker as a whole suffix.
        # str.rstrip('Buy') would strip any trailing run of the characters
        # B/u/y instead (and 'Sell' any of S/e/l), mangling other text.
        for signal in ('Buy', 'Sell'):
            if value.endswith(signal):
                value = value[:-len(signal)]
        return value.strip()

    def extract_indicator_value(label):
        """Return the stripped text of the value cell for *label*."""
        text = cell_text_after(label)
        return text.strip() if text is not None else 'N/A'

    data = {
        'MA50': extract_ma_value('MA50'),
        'MA100': extract_ma_value('MA100'),
        'MA200': extract_ma_value('MA200'),
        'RSI': extract_indicator_value('RSI(14)'),
        'MACD': extract_indicator_value('MACD(12,26)'),
    }
    logging.info(f"Technical info extracted from: {technical_url}")
    print(f"Technical info for URL {technical_url}: {data}")
    return data
def process_stock(row):
    """Collect all scraped information for one input row.

    Args:
        row: A record indexable by column name that also supports selecting a
            list of columns and calling ``.to_dict()`` on the selection (as a
            pandas Series does). Must contain ``'ISIN Code'``,
            ``'Company Name'``, ``'Industry'`` and ``'Symbol'``.

    Returns:
        A dict merging the four identifying columns with the general,
        financial and technical info, or ``None`` when no general info was
        found or any step raised.
    """
    isin = row['ISIN Code']
    try:
        general_info = extract_general_info(isin)
        if not general_info:
            return None

        # Work on a copy so the URL keys can be removed without mutating the
        # dict owned by extract_general_info's caller chain.
        general_info = dict(general_info)
        general_info.pop('Stock URL', None)  # not part of the merged output
        financials_url = general_info.pop('Financials URL', None)
        technical_url = general_info.pop('Technical URL', None)

        # `or {}` guards the ** merge below against a helper returning None.
        financial_info = (
            extract_financial_info(financials_url) if financials_url else {}
        ) or {}
        technical_info = (
            extract_technical_info(technical_url) if technical_url else {}
        ) or {}

        identity = row[['Company Name', 'Industry', 'Symbol', 'ISIN Code']].to_dict()
        return {
            **identity,
            **general_info,
            **financial_info,
            **technical_info,
        }
    except Exception as e:
        # Batch boundary: one failing stock is logged and skipped rather than
        # aborting the whole run.
        logging.error(f"Error processing ISIN {isin}: {e}")
        return None
# Module-level accumulator, initially empty. Presumably filled with the dicts
# returned by process_stock -- its use is outside this view; TODO confirm.
all_stock_data = []