XML To Dataframe
XML To Dataframe
XML To Dataframe
In [135]:
import xml.etree.ElementTree as ET
import os
import pandas as pd
In [129]:
def extract_data(file):
    """Parse one news-item XML file into the fields used for a DataFrame row.

    Parameters
    ----------
    file : str
        Path to the XML file. The root element must carry an ``itemid``
        attribute.

    Returns
    -------
    tuple
        ``(headline, text, topics, published_date, itemid, xmlfile)`` where

        * ``headline`` is the text of the last ``<headline>`` child of the
          root, or ``None`` if there is none;
        * ``text`` is the text of every ``<p>`` directly under ``<text>``,
          concatenated with no separator (empty paragraphs contribute
          nothing);
        * ``topics`` is a list of the ``code`` attribute of every
          ``<codes>/<code>`` element under ``<metadata>``;
        * ``published_date`` is the ``value`` attribute of the ``<dc>``
          element whose ``element`` attribute is ``dc.date.published``, or
          ``None`` if no such element exists;
        * ``itemid`` is the root's ``itemid`` attribute;
        * ``xmlfile`` is the file's base name up to its first ``.``.

    Raises
    ------
    KeyError
        If the root has no ``itemid`` attribute, a ``<code>`` has no ``code``
        attribute, or the matching ``<dc>`` has no ``value`` attribute.
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    """
    xmlfile = os.path.basename(file).split('.')[0]

    # Defaults so a file without <headline> or a published-date <dc> yields
    # None instead of raising UnboundLocalError at the return statement.
    headline = None
    published_date = None
    paragraphs = []
    topics = []

    root = ET.parse(file).getroot()
    itemid = root.attrib['itemid']

    for child in root:
        if child.tag == 'headline':
            headline = child.text
        elif child.tag == 'text':
            # Element.text is None for an empty <p/>; treat it as ''.
            paragraphs.extend(p.text or '' for p in child.findall('p'))
        elif child.tag == 'metadata':
            for dc in child.findall('dc'):
                # .get() so a <dc> without an 'element' attribute is skipped
                # rather than aborting the whole file.
                if dc.get('element') == 'dc.date.published':
                    published_date = dc.attrib['value']
            for codes in child.findall('codes'):
                for code in codes.findall('code'):
                    topics.append(code.attrib['code'])

    text = ''.join(paragraphs)
    return (headline, text, topics, published_date, itemid, xmlfile)
In [195]:
def create_dataframe(directory):
# Builds a DataFrame with one row per file found under
# directory/<sub_directory>/<filename>, each row being the tuple returned
# by extract_data() for that file.
# NOTE(review): indentation was lost in this export, and the DataFrame(...)
# line and the extract_data(...) line below are cut off at the page margin.
# The remaining column names are not visible here; recover them from the
# original .ipynb rather than guessing.
df = pd.DataFrame(columns=['Headline','Text','Bip_topic','Publishe
# Every entry of `directory` is listed as if it were a sub-directory.
for sub_directory in os.listdir(directory):
for filename in os.listdir(directory+'/'+sub_directory):
# presumably the truncated argument ends with `filename)` — TODO confirm
values = extract_data(directory+'/'+sub_directory+'/'+file
# The tuple is labelled with the frame's column names, so the number of
# columns must equal the length of the tuple extract_data() returns.
keys = df.columns
row = pd.Series(values,index=keys)
# NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed in
# 2.0, and it returns a new frame on every call. Collecting the rows in a
# list and constructing the DataFrame once would avoid both issues.
df = df.append(row,ignore_index=True)
return df
In [204]:
# Build the frame from every file under 'Data/Data'; the path is relative,
# so it is resolved against the kernel's current working directory.
df = create_dataframe('Data/Data')
localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 1/6
10/23/2019 xml2df - Jupyter Notebook
In [206]:
# Show the per-document code lists as a NumPy array (one list per row).
df['Bip_topic'].to_numpy()
Out[206]:
In [207]:
localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 2/6
10/23/2019 xml2df - Jupyter Notebook
In [213]:
# Distinct codes, in order of first appearance.
# NOTE(review): `flattened_list` is not defined in any cell visible in this
# export (the source of the In [207] cell is missing) — presumably it is the
# per-row lists of df['Bip_topic'] flattened into one list; confirm, and
# restore that cell so the notebook survives Restart & Run All.
pd.unique(flattened_list)
Out[213]:
localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 3/6
10/23/2019 xml2df - Jupyter Notebook
'COL', 'I42100',
'E512', 'G158', 'LATV', 'I24300', 'I83800', 'E1
21', 'I3640010',
'I95100', 'NETH', 'I34430', 'MALAY', 'C14', 'NO
RW', 'I25140',
'C41', 'C411', 'I45100', 'I63000', 'I83100', 'G
152', 'ROM',
'I32600', 'ICST', 'I42000', 'BALTST', 'LITH',
'I9741109', 'ARG',
'C16', 'E13', 'E132', 'I35000', 'C34', 'I1610
0', 'I34400', 'BANDH',
'GPRO', 'KENYA', 'I82001', 'I82002', 'I97100',
'GENT', 'WORLD',
'ASIA', 'C313', 'I01002', 'I0100223', 'OMAN',
'I83954', 'BERM',
'I34420', 'I92110', 'I81403', 'I50000', 'YUG',
'I22460', 'BSHZG',
'I83951', 'I66000', 'I8150103', 'I32550', 'I610
00', 'MEX',
'I34200', 'I8150206', 'I47500', 'I64500', 'I010
0138', 'I83600',
'G153', 'I42900', 'I49420', 'I37330', 'I50200',
'I66500', 'I34600',
'C183', 'I34000', 'I34440', 'ABDBI', 'UAE', 'C3
31', 'I34410',
'I77002', 'I0100128', 'I24700', 'I24800', 'I974
1102', 'I65200',
'I37000', 'I1610109', 'I16000', 'E131', 'GABO
N', 'E31', 'E311',
'GSEVEN', 'I82000', 'TANZA', 'SLVAK', 'ALG', 'T
URK', 'ANGOL',
'SAFR', 'UN', 'MOROC', 'I16200', 'I42800', 'I48
300', 'GSCI',
'I0100136', 'I45300', 'I5020002', 'I72101', 'I3
2000', 'VIETN',
'OPEC', 'E14', 'I83200', 'PAPNG', 'I32450', 'I5
0100', 'I8150106',
'I3302019', 'I0100144', 'I8500031', 'I32540',
'I65400', 'E511',
'E513', 'GAMB', 'I42390', 'I47520', 'I66100',
'I49540', 'OECD',
'G157', 'I15000', 'QATAR', 'GFAS', 'I34531', 'I
33030', 'E143',
'GENV', 'I43000', 'I9741105', 'I47510', 'I4910
0', 'BUL', 'IRAN',
'G156', 'LUX', 'CUBA', 'ZAMBIA', 'RWANDA', 'I65
600', 'I64300',
'I64800', 'I16300', 'I0100105', 'I41200', 'BYEL
RS', 'I8500029',
localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 4/6
10/23/2019 xml2df - Jupyter Notebook
localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 5/6
10/23/2019 xml2df - Jupyter Notebook
'ECU', 'NICG',
'BAHRN', 'NEPAL', 'I41900', 'LIBER', 'MAURTN',
'I25670',
'I8200318', 'ELSAL', 'HAIT', 'I32810', 'I6620
0', 'I49200', 'GTEN',
'I22470', 'KUWAIT', 'I47101', 'I64700', 'I8480
3', 'I97412',
'I83500', 'REUNI', 'I9741112', 'I25620', 'I7700
3', 'FIJI',
'I37400', 'E61', 'I1300014', 'I24500', 'I4400
0', 'I41230',
'I32300', 'I37300', 'I23000', 'I34350', 'I81501
10', 'BOL', 'MRCSL',
'I98100', 'I3302017', 'I32200', 'MALAG', 'GUA
T', 'SOLIL', 'I64600',
'DUBAI', 'I47200', 'I84801', 'I47530', 'I364002
9', 'I37100',
'BHUTAN', 'I1300002', 'I3640002', 'I83400', 'RA
KH', 'I34700',
'I6560003', 'I22300', 'I5020028', 'I22000', 'I6
540011', 'I48110',
'I6540005', 'I42210', 'CVI'], dtype=object)
In [ ]:
localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 6/6