aquibmoin committed on
Commit
0bdccf9
·
verified ·
1 Parent(s): 4b8acc1

Create extract_table.py

Browse files
Files changed (1) hide show
  1. utils/extract_table.py +51 -0
utils/extract_table.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import pandas as pd
3
+
4
def extract_table_from_response(gpt_response):
    """Return the lines of the Markdown table embedded in a GPT response.

    A line counts as a table line when it contains at least three pipe
    characters (i.e. splitting it on ``|`` yields more than three pieces).
    The result is the contiguous slice of the response's lines running from
    the first table line to the last one, so any non-table lines that sit
    between them are included as well.

    Args:
        gpt_response: Full text of the response.

    Returns:
        A list of lines (not a joined string), or ``None`` when no table
        line is found.
    """
    lines = gpt_response.strip().split("\n")

    # Track positions, not line text: looking a line up by value with
    # list.index() returns its FIRST occurrence, which truncates the table
    # whenever the last row duplicates an earlier line.
    table_indices = [
        index for index, line in enumerate(lines) if line.count("|") >= 3
    ]

    if not table_indices:
        return None

    return lines[table_indices[0]:table_indices[-1] + 1]
23
+
24
def gpt_response_to_dataframe(gpt_response):
    """Convert the Markdown table found in a GPT response into a DataFrame.

    The table lines come from ``extract_table_from_response``. The first
    line made up only of pipes, dashes, alignment colons and spaces is taken
    as the header/body separator; the line directly above it supplies the
    column names and the lines below it supply the rows. Cells are the
    whitespace-stripped pieces between a line's first and last pipe.

    Args:
        gpt_response: Full text of the response.

    Returns:
        A ``pandas.DataFrame`` of string cells. An empty DataFrame is
        returned when no table is found, when no separator line is found,
        or when the separator has no header line above it.
    """
    table_lines = extract_table_from_response(gpt_response)
    if not table_lines:
        return pd.DataFrame()

    # Accept "|---|---|" as well as the spaced / aligned variants such as
    # "| --- | :---: |", which the delimiter row is commonly written as.
    separator_chars = {"|", "-", ":", " "}
    sep_line_index = next(
        (
            index
            for index, line in enumerate(table_lines)
            if "|" in line
            and "-" in line
            and set(line.strip()) <= separator_chars
        ),
        None,
    )

    # Index 0 means there is no header line above the separator; indexing
    # with -1 would silently wrap around and use the last line as header.
    if sep_line_index is None or sep_line_index == 0:
        return pd.DataFrame()

    headers = [
        header.strip()
        for header in table_lines[sep_line_index - 1].split("|")[1:-1]
    ]

    # The extracted slice may contain non-table lines (e.g. blank lines or
    # prose between two tables); skip those instead of emitting empty rows.
    rows = [
        [cell.strip() for cell in row.split("|")[1:-1]]
        for row in table_lines[sep_line_index + 1:]
        if "|" in row
    ]

    return pd.DataFrame(rows, columns=headers)