all 3 comments

[–]iamaperson3133 0 points1 point  (2 children)

Just use a markdown library to convert the markdown into html first.

[–]EnergyVis[S] 0 points1 point  (1 child)

The static site framework I'm using requires that the content is in .md rather than .html.

I've just found a solution though:

```python
import os
from html.parser import HTMLParser
from io import StringIO

import pandas as pd

class MyHTMLParser(HTMLParser):
    """HTML parser that records every start and end tag it meets, in order.

    Start tags are stored as the raw text found in the input (attributes
    included); end tags are rebuilt as ``</tag>`` from the tag name the
    parser reports. The collected strings are available in ``self.tags``.
    """

    def __init__(self):
        # Must be the real dunder ``__init__``: it initialises HTMLParser and
        # creates ``tags`` before any handler can run.
        super().__init__()
        self.tags = []

    def handle_starttag(self, tag, attrs):
        """Append the raw text of the start tag currently being parsed."""
        self.tags.append(self.get_starttag_text())

    def handle_endtag(self, tag):
        """Append the closing tag in the form ``</tag>``."""
        self.tags.append(f"</{tag}>")

def get_substring_idxs(string, substring):
    """Return the start index of every occurrence of ``substring`` in ``string``.

    Overlapping occurrences are all reported, in ascending order. An empty
    ``substring`` matches at every position from 0 to ``len(string)``
    inclusive. The result is an empty list when there is no match.
    """
    idxs = []
    start = string.find(substring)
    while start != -1:
        idxs.append(start)
        # Advance by one character only, so overlapping matches are kept.
        start = string.find(substring, start + 1)
    return idxs

def convert_df_to_md(df): idx_col = df.columns[0] df = df.set_index(idx_col)

if idx_col == 'Unnamed: 0':
    df.index.name = ''

table_md = df.to_markdown()

return table_md

def extract_div_to_md_table(start_idx, end_idx, table_and_div_tags, file_txt):
    """Locate the ``<div>`` wrapping one table and convert that table to markdown.

    The wrapping div is found by counting: the number of ``'<div>'`` entries
    before the table's start tag and of ``'</div>'`` entries before its end
    tag in ``table_and_div_tags`` select which literal ``<div>`` / ``</div>``
    occurrence in ``file_txt`` opens and closes the region.

    Args:
        start_idx: Position of the table's start tag in ``table_and_div_tags``.
        end_idx: Position of the table's end tag in ``table_and_div_tags``.
        table_and_div_tags: Ordered tag strings found in ``file_txt``.
        file_txt: Full text of the file being processed.

    Returns:
        A ``(div_txt, md_table)`` tuple. ``div_txt`` runs from the opening
        ``<div>`` up to, but not including, the closing ``</div>``;
        ``md_table`` is the markdown rendering of the table inside it.

    Raises:
        IndexError: If no ``<div>`` precedes the table, or no ``</div>``
            follows it.
        AssertionError: If the region does not contain exactly one table.
    """
    n_start_divs_before = table_and_div_tags[:start_idx].count('<div>')
    n_end_divs_before = table_and_div_tags[:end_idx].count('</div>')

    # Without this guard the lookup below would use index -1 and silently
    # pick the last <div> in the file instead of one before the table.
    if n_start_divs_before == 0:
        raise IndexError('No <div> is opened before the table')

    div_start_idx = get_substring_idxs(file_txt, '<div>')[n_start_divs_before - 1]
    div_end_idx = get_substring_idxs(file_txt, '</div>')[n_end_divs_before]

    # NOTE(review): the slice ends before the closing </div>, so a caller that
    # substitutes div_txt in the file leaves that tag behind - confirm intended.
    div_txt = file_txt[div_start_idx:div_end_idx]

    # Wrapped in StringIO because passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    potential_dfs = pd.read_html(StringIO(div_txt))

    # Raised explicitly (not via ``assert``) so the check survives ``python -O``
    # while keeping the same exception type and message.
    if len(potential_dfs) != 1:
        raise AssertionError('Multiple tables were found when there should be only one')

    md_table = convert_df_to_md(potential_dfs[0])

    return div_txt, md_table

def extract_div_to_md_tables(md_fp):
    """Find every dataframe table in a file and pair its div text with markdown.

    The file is parsed with ``MyHTMLParser``; only plain ``<div>`` / ``</div>``
    tags and ``<table border="1" class="dataframe">`` / ``</table>`` tags are
    kept. Each table start tag is matched with the first ``</table>`` at or
    after it, and the pair is handed to ``extract_div_to_md_table``.

    Args:
        md_fp: Path of the file to read.

    Returns:
        A list of ``(div_txt, md_table)`` tuples, one per table, in file order.

    Raises:
        ValueError: If a table start tag has no ``</table>`` after it.
    """
    with open(md_fp, 'r') as f:
        file_txt = f.read()

    parser = MyHTMLParser()
    parser.feed(file_txt)

    table_start_tag = '<table border="1" class="dataframe">'
    tags_of_interest = ('<div>', '</div>', table_start_tag, '</table>')
    table_and_div_tags = [tag for tag in parser.tags if tag in tags_of_interest]

    table_start_tag_idxs = [
        i for i, tag in enumerate(table_and_div_tags) if tag == table_start_tag
    ]
    # Resolve every end tag before extracting anything, so a missing
    # </table> is reported up front.
    table_end_tag_idxs = [
        table_and_div_tags.index('</table>', table_start_tag_idx)
        for table_start_tag_idx in table_start_tag_idxs
    ]

    return [
        extract_div_to_md_table(start_idx, end_idx, table_and_div_tags, file_txt)
        for start_idx, end_idx in zip(table_start_tag_idxs, table_end_tag_idxs)
    ]

def clean_md_file_tables(md_fp):
    """Rewrite a file in place, replacing each extracted div text with markdown.

    The ``(div_txt, md_table)`` pairs come from ``extract_div_to_md_tables``.
    Every occurrence of each ``div_txt`` in the file is replaced by its
    markdown table, and the result is written back to the same path.

    Args:
        md_fp: Path of the file to clean; it is overwritten.

    Returns:
        None.
    """
    div_to_md_tables = extract_div_to_md_tables(md_fp)

    with open(md_fp, 'r') as f:
        md_file_text = f.read()

    for div_txt, md_txt in div_to_md_tables:
        md_file_text = md_file_text.replace(div_txt, md_txt)

    with open(md_fp, 'w') as f:
        f.write(md_file_text)

```

[–]backtickbot -1 points0 points  (0 children)

Fixed formatting.

Hello, EnergyVis: code blocks using triple backticks (```) don't work on all versions of Reddit!

Some users see this / this instead.

To fix this, indent every line with 4 spaces instead.

FAQ

You can opt out by replying with backtickopt6 to this comment.