Consider XSLT, the special-purpose language designed to transform XML files, for example by flattening them at certain sections. Python's third-party module, lxml, can run XSLT 1.0 scripts and XPath 1.0 expressions. Specifically, XSLT can handle your XPath extractions; then, from the single transformed result tree, you can build the three needed data frames. For well-formedness, the code below assumes the following root and data structure:
<integration-outbound:IntegrationEntity xmlns:integration-outbound="http://example.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
...same content...
</integration-outbound:IntegrationEntity>
XSLT (save as .xsl, a special .xml file)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:integration-outbound="http://example.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<xsl:output method="xml" omit-xml-declaration="yes" indent="yes" />
<xsl:strip-space elements="*" />
<xsl:template match="integration-outbound:IntegrationEntity">
<data>
<xsl:apply-templates select="integrationEntityHeader/descendant::attachment" />
<xsl:apply-templates select="integrationEntityDetails/descendant::dataProcessingInfo" />
<xsl:apply-templates select="integrationEntityDetails/descendant::forms/descendant::field" />
</data>
</xsl:template>
<xsl:template match="attachment">
<integrationEntityHeader>
<xsl:copy-of select="ancestor::integrationEntityHeader/*[name()!='attachments']" />
<xsl:copy-of select="*" />
</integrationEntityHeader>
</xsl:template>
<xsl:template match="dataProcessingInfo">
<integrationEntityDetailsControlBlock>
<xsl:copy-of select="ancestor::integration-outbound:IntegrationEntity/integrationEntityHeader/*[position() <= 2]" />
<requestId><xsl:value-of select="ancestor::supplier/requestId" /></requestId>
<supplier_id><xsl:value-of select="ancestor::supplier/id" /></supplier_id>
<xsl:copy-of select="*" />
</integrationEntityDetailsControlBlock>
</xsl:template>
<xsl:template match="field">
<integrationEntityDetailsForms>
<form_id><xsl:value-of select="ancestor::form/id" /></form_id>
<xsl:copy-of select="ancestor::record/*[name()!='fields']" />
<SupplierFormRecordFieldId><xsl:value-of select="id" /></SupplierFormRecordFieldId>
<SupplierFormRecordFieldValue><xsl:value-of select="value" /></SupplierFormRecordFieldValue>
<xsl:copy-of select="ancestor::integration-outbound:IntegrationEntity/integrationEntityHeader/*[position() &lt;= 2]" />
<requestId><xsl:value-of select="ancestor::supplier/requestId" /></requestId>
<supplier_id><xsl:value-of select="ancestor::supplier/id" /></supplier_id>
</integrationEntityDetailsForms>
</xsl:template>
</xsl:stylesheet>
Python
import lxml.etree as et
import pandas as pd

# LOAD XML AND XSL
doc = et.parse('Input.xml')
style = et.parse('Script.xsl')

# INITIALIZE AND RUN TRANSFORMATION
transformer = et.XSLT(style)
flat_doc = transformer(doc)

# BUILD THREE DATA FRAMES
df_header = pd.DataFrame([{i.tag: i.text for i in el} for el in flat_doc.xpath('integrationEntityHeader')])
df_detailsControlBlock = pd.DataFrame([{i.tag: i.text for i in el} for el in flat_doc.xpath('integrationEntityDetailsControlBlock')])
df_detailsForms = pd.DataFrame([{i.tag: i.text for i in el} for el in flat_doc.xpath('integrationEntityDetailsForms')])
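From there, exporting the three required CSV files is one call per frame (a minimal sketch; the output file names are assumptions, not from the original answer):

# Hypothetical output names -- adjust to whatever the downstream process expects
df_header.to_csv('Header.csv', index=False)
df_detailsControlBlock.to_csv('DetailsControlBlock.csv', index=False)
df_detailsForms.to_csv('DetailsForms.csv', index=False)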
The XML is converted to a dict and the parsing logic is then written against that dict; the reason is that the same approach can also be used for JSON. For simplicity I have created a three-level nested XML; this works on Python 3. Change the variables to match where you store the files (a sketch of the driver logic follows the config listing below):

process_config_csv = 'config.csv'
xml_file_name = 'test.xml'
XPATH, ColumName, CSV_File_Name
/Company/Employee[]/FirstName, FirstName, Name.csv
/Company/Employee[]/LastName, LastName, Name.csv
/Company/Employee[]/ContactNo, ContactNo, Name.csv
/Company/Employee[]/Email, Email, Name.csv
/Company/Employee[]/FirstName, FirstName, Address.csv
/Company/Employee[]/LastName, LastName, Address.csv
/Company/Employee[]/ContactNo, ContactNo, Address.csv
/Company/Employee[]/Email, Email, Address.csv
/Company/Employee[]/Addresses/Address[]/City, City, Address.csv
/Company/Employee[]/Addresses/Address[]/State, State, Address.csv
/Company/Employee[]/Addresses/Address[]/Zip, Zip, Address.csv
/Company/Employee[]/Addresses/Address[]/type, type, Address.csv
/Company/Employee[]/FirstName, FirstName, Form.csv
/Company/Employee[]/LastName, LastName, Form.csv
/Company/Employee[]/ContactNo, ContactNo, Form.csv
/Company/Employee[]/Email, Email, Form.csv
/Company/Employee[]/Addresses/Address[]/type, type, Form.csv
/Company/Employee[]/Addresses/Address[]/forms/form[]/id, id, Form.csv
/Company/Employee[]/Addresses/Address[]/forms/form[]/value, value, Form.csv
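The full script is not reproduced here, but a minimal sketch of how such a config can drive the flattening looks like this. It assumes xmltodict is installed, and it simplifies by requiring every column of a given CSV to share the same exploded prefix (true for Name.csv; Form.csv mixes levels and would additionally need the employee-level fields repeated per form row):

# A minimal config-driven flattener sketch, NOT the author's full script.
import pandas as pd
import xmltodict

process_config_csv = 'config.csv'
xml_file_name = 'test.xml'

def as_list(node):
    # xmltodict yields a dict for a single child but a list for repeats
    return node if isinstance(node, list) else [node]

config = pd.read_csv(process_config_csv, skipinitialspace=True)
with open(xml_file_name) as f:
    doc = xmltodict.parse(f.read())

for csv_name, grp in config.groupby('CSV_File_Name', sort=False):
    paths = [p.strip('/').split('/') for p in grp['XPATH']]
    names = list(grp['ColumName'])
    # split each path at its last []-marked segment:
    # the prefix defines the rows, the tail defines the cell value
    cut = max(i for i, s in enumerate(paths[0]) if s.endswith('[]')) + 1
    prefix, tails = paths[0][:cut], [p[cut:] for p in paths]
    records = [doc]
    for seg in prefix:
        key = seg[:-2] if seg.endswith('[]') else seg
        records = [child for node in records for child in as_list(node[key])]
    rows = []
    for rec in records:
        row = {}
        for name, tail in zip(names, tails):
            val = rec
            for seg in tail:
                val = val.get(seg) if isinstance(val, dict) else None
            row[name] = val
        rows.append(row)
    pd.DataFrame(rows).to_csv(csv_name, index=False)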
I think this line is missing in the question:
df_3['integrationEntityDetails.supplier.forms.form.records.record'] = (
df_3['integrationEntityDetails.supplier.forms.form.records'].apply(
lambda x: x.get('record')
)
)
Then, for the Internalid, you could do this:
df_3['integrationEntityDetails.supplier.forms.form.records.record.Internalid'] = (
df_3['integrationEntityDetails.supplier.forms.form.records.record'].apply(
lambda x: x[0].get('Internalid') if isinstance(x, list)
else x.get('Internalid')
)
)
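Alternatively, a dict-or-list column like this can be expanded in bulk with pandas (a sketch, under the same assumption that df_3 came from an xmltodict-style parse; the column names match the answer above):

import pandas as pd

# explode() turns list cells into one row per element and leaves dict cells alone
records = df_3['integrationEntityDetails.supplier.forms.form.records.record']
exploded = records.explode()

# json_normalize() then spreads each dict's keys into columns
flat = pd.json_normalize(exploded.dropna().tolist())
print(flat['Internalid'].head())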
Databricks' spark-xml library makes XML processing easy.
val headers = spark.read.format("xml").option("rowTag", "integrationEntityHeader").load("stackOverflowRafaXML.xml")
headers.write.csv(<headerFilename>) // Create CSV from the header file
val details = spark.read.format("xml").option("rowTag", "integrationEntityDetails").load("stackOverflowRafaXML.xml")
// The details need further unnesting. To get suppliers, for instance, you can do
val supplier = spark.read.format("xml").option("rowTag", "supplier").load("stackOverflowRafaXML.xml")
supplier.show
+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------+--------------------+---------+------+------------+----------+---------------------+
| allLocations| bankDetails| companyDetails| contactDetails| controlBlock|facilityCode| forms| id| myLocation|requestId|status|supplierType|systemCode|systemFacilityDetails|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------+--------------------+---------+------+------------+----------+---------------------+
|[[HQ, 2501 GRANT ...|[[[[LOW_BANK_KEY,...|[No, SUPPLIER, 25...|[[[1704312142, SI...|[[[MODE, Onboardi...| 1|[[[CATEGORY_PRODS...|1647059|[[1704342, false,...| 2614352|ACTIVE| Operational| 1| [[ACTIVE, 1, 1]]|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------+--------------------+---------+------+------------+----------+---------------------+
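For reference, the same reads can be done from PySpark, keeping the rest of this document in Python (a sketch; it assumes the com.databricks:spark-xml package is on the classpath and that forms.form is an array column, as the truncated schema above suggests):

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

spark = SparkSession.builder.getOrCreate()

supplier = (spark.read.format("xml")
            .option("rowTag", "supplier")
            .load("stackOverflowRafaXML.xml"))

# unnest one level: one row per form (column names are assumptions)
supplier.select("id", "requestId", explode("forms.form").alias("form")).show()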
Have you tried pandas_read_xml?
pip install pandas_read_xml
You can do something like:
import pandas_read_xml as pdx
df = pdx.read_xml('filename.xml')
To flatten, you could
df = pdx.flatten(df)
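A deeply nested document may need flatten applied more than once before every column is scalar; from there, writing a CSV is the usual pandas call (a sketch; the pass count and file name are assumptions, not from the package docs):

import pandas_read_xml as pdx

df = pdx.read_xml('filename.xml')
# assumption: each flatten() pass unnests one level, so repeat for deep nesting
for _ in range(3):  # assumed depth
    df = pdx.flatten(df)
df.to_csv('flattened.csv', index=False)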
The input is the XML above plus the config CSV below. Three CSV files need to be created from the XPaths listed in the config; the goal is a common, reusable XML-to-CSV conversion. Based on the config file, the XML should be flattened and exploded into multiple CSVs and stored. In the config, every array/multi-level/explode column must be marked with []; the header row is required, as it is referred to in the code.
Check out the code below. When we run csv.reader(), all of our CSV data becomes accessible. The next(csvreader) call reads a single line from the CSV; every time you call it, it moves to the next line. We can also loop through every row of the CSV with a for loop, as in for row in csvreader. Make sure that you have the same number of columns in each row; otherwise, you'll likely run into errors when working with your list of lists.
import csv

filename = "my_data.csv"
fields = []
rows = []

# Reading csv file
with open(filename, 'r') as csvfile:
    # Creating a csv reader object
    csvreader = csv.reader(csvfile)
    # Extracting field names from the first row (Python 3: next(), not .next())
    fields = next(csvreader)
    # Extracting each data row one by one
    for row in csvreader:
        rows.append(row)

# Printing out the first 5 rows
for row in rows[:5]:
    print(row)
filename = "my_data.csv" fields = [] rows = [] # Reading csv file with open(filename, 'r') as csvfile: # Creating a csv reader object csvreader = csv.reader(csvfile) # Extracting field names in the first row fields = csvreader.next() # Extracting each data row one by one for row in csvreader: rows.append(row) # Printing out the first 5 rows for row in rows[: 5]: print(row)
Writing to CSV in Python is just as easy. Set up your field names in a single list, and your data in a list of lists. This time we'll create a writer() object and use it to write our data to file, very similarly to how we did the reading.
import csv

# Field names
fields = ['Name', 'Goals', 'Assists', 'Shots']

# Rows of data in the csv file
rows = [
    ['Emily', '12', '18', '112'],
    ['Katie', '8', '24', '96'],
    ['John', '16', '9', '101'],
    ['Mike', '3', '14', '82'],
]

filename = "soccer.csv"

# Writing to csv file (newline='' avoids blank lines on Windows)
with open(filename, 'w', newline='') as csvfile:
    # Creating a csv writer object
    csvwriter = csv.writer(csvfile)
    # Writing the fields
    csvwriter.writerow(fields)
    # Writing the data rows
    csvwriter.writerows(rows)
Of course, installing the wonderful Pandas library will make working with your data far easier once you've read it into a variable. Reading from CSV is a single line, as is writing it back to file!
import pandas as pd

filename = "my_data.csv"

# Read in the data
data = pd.read_csv(filename)

# Print the first 5 rows
print(data.head(5))

# Write the data to file
data.to_csv("new_data.csv", sep=",", index=False)
We can even use Pandas to convert from CSV to a list of dictionaries with a quick one-liner. Once you have the data formatted as a list of dictionaries, we'll use the dicttoxml library to convert it to XML format. We'll also save it to file as a JSON!
import pandas as pd
from dicttoxml import dicttoxml
import json

# Building our dataframe
data = {
    'Name': ['Emily', 'Katie', 'John', 'Mike'],
    'Goals': [12, 8, 16, 3],
    'Assists': [18, 24, 9, 14],
    'Shots': [112, 96, 101, 82],
}
df = pd.DataFrame(data, columns=data.keys())

# Converting the dataframe to a dictionary
# Then save it to file
data_dict = df.to_dict(orient="records")
with open('output.json', "w+") as f:
    json.dump(data_dict, f, indent=4)

# Converting the dataframe to XML
# Then save it to file
xml_data = dicttoxml(data_dict).decode()
with open("output.xml", "w+") as f:
    f.write(xml_data)
filename = "my_data.csv" fields = [] rows = [] # Reading csv file with open(filename, 'r') as csvfile: # Creating a csv reader object csvreader = csv.reader(csvfile) # Extracting field names in the first row fields = csvreader.next() # Extracting each data row one by one for row in csvreader: rows.append(row) # Printing out the first 5 rows for row in rows[: 5]: print(row)
# Field names fields = ['Name', 'Goals', 'Assists', 'Shots'] # Rows of data in the csv file rows = [ ['Emily', '12', '18', '112'], ['Katie', '8', '24', '96'], ['John', '16', '9', '101'], ['Mike', '3', '14', '82'] ] filename = "soccer.csv" # Writing to csv file with open(filename, 'w+') as csvfile: # Creating a csv writer object csvwriter = csv.writer(csvfile) # Writing the fields csvwriter.writerow(fields) # Writing the data rows csvwriter.writerows(rows)
filename = "my_data.csv" # Read in the data data = pd.read_csv(filename) # Print the first 5 rows print(data.head(5)) # Write the data to file data.to_csv("new_data.csv", sep = ",", index = False)
# Building our dataframe data = { 'Name': ['Emily', 'Katie', 'John', 'Mike'], 'Goals': [12, 8, 16, 3], 'Assists': [18, 24, 9, 14], 'Shots': [112, 96, 101, 82] } df = pd.DataFrame(data, columns = data.keys()) # Converting the dataframe to a dictionary # Then save it to file data_dict = df.to_dict(orient = "records") with open('output.json', "w+") as f: json.dump(data_dict, f, indent = 4) # Converting the dataframe to XML # Then save it to file xml_data = dicttoxml(data_dict).decode() with open("output.xml", "w+") as f: f.write(xml_data)
JSON provides a clean and easily readable format because it maintains a dictionary-style structure. Just like CSV, Python has a built-in module for JSON that makes reading and writing super easy! When we read in the JSON, it becomes a dictionary; we then write that dictionary back to file.

import json
import pandas as pd

# Read the data from file
# We now have a Python dictionary
with open('data.json') as f:
    data_listofdict = json.load(f)

# We can do the same thing with pandas
data_df = pd.read_json('data.json', orient='records')

# We can write a dictionary to JSON like so
# Use 'indent' and 'sort_keys' to make the JSON file look nice
with open('new_data.json', 'w+') as json_file:
    json.dump(data_listofdict, json_file, indent=4, sort_keys=True)

# And again the same thing with pandas
export = data_df.to_json('new_data.json', orient='records')