Spaces:
Running
Running
from functools import lru_cache | |
import duckdb | |
import gradio as gr | |
import pandas as pd | |
import requests | |
from duckdb import DuckDBPyRelation | |
from duckdb.typing import DuckDBPyType | |
from huggingface_hub import HfApi | |
# Short aliases for the DuckDB relation / type classes used in annotations.
Table = DuckDBPyRelation
Dtype = DuckDBPyType

# Values of a library's "function" field whose loading codes this app uses.
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")

# Placeholder relation: ten rows of four all-NULL columns.
EMPTY_TABLE = duckdb.sql(
    "SELECT null as col_1, null as col_2, null as col_3, null as col_4 FROM range(10)"
)

PAGE_SIZE = 100  # row LIMIT used when previewing a dataset file
NUM_TRENDING_DATASETS = 10  # `limit` for the trending dataset listing
NUM_USER_DATASETS = 10  # `limit` for the logged-in user's dataset listing

# Stylesheet handed to gr.Blocks(css=...).
css = """
.transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
background: var(--body-background-fill);
}
.gradio-container {
padding: var(--size-4) 0 !important;
max-width: 98% !important;
}
"""
@lru_cache(maxsize=32)
def cached_duckdb_sql(query: str) -> Table:
    """Return the DuckDB relation for *query*, memoised on the query text.

    Repeated calls with the same SQL string return the same relation object
    instead of asking DuckDB to build it again. The cache is bounded so that
    browsing many datasets cannot grow it without limit.

    Args:
        query: Complete SQL statement to hand to ``duckdb.sql``.

    Returns:
        The relation produced by ``duckdb.sql(query)``.
    """
    return duckdb.sql(query)
def to_json_df(tbl: Table) -> pd.DataFrame:
    """Convert *tbl* to a pandas DataFrame whose cells are JSON text.

    For every column the value is wrapped in a one-element list, cast to JSON,
    element 0 is extracted and cast to VARCHAR; the JSON text ``null`` is
    mapped back to SQL NULL via ``nullif``.

    Column names are emitted as double-quoted identifiers so that names
    containing spaces, punctuation or reserved words do not break the query.

    Args:
        tbl: Relation to convert.

    Returns:
        A DataFrame with the same column names as *tbl*.
    """

    def quote(name: str) -> str:
        # SQL identifier quoting: wrap in double quotes, double any inner quote.
        return '"' + str(name).replace('"', '""') + '"'

    projection = ", ".join(
        f"nullif(([{quote(col)}]::JSON)[0]::VARCHAR, 'null') as {quote(col)}"
        for col in tbl.columns
    )
    # The SQL refers to `tbl` by name; DuckDB resolves it from this function's
    # Python variables, so the parameter must keep that name.
    return duckdb.sql(f"SELECT {projection} FROM tbl").df()
def from_json_df(df: pd.DataFrame, dtypes: list[Dtype]) -> Table:
    """Parse a DataFrame of JSON-text cells back into a typed DuckDB relation.

    Each column is read as JSON (missing values become the JSON text ``null``)
    and cast to the type at the same position in *dtypes*. Columns and types
    are paired with ``zip``, so extra entries on either side are ignored.

    The type is converted with ``str()`` before being spliced into the SQL:
    concatenating a ``DuckDBPyType`` object to a string raises ``TypeError``.
    Column names are emitted as double-quoted identifiers so unusual names do
    not break the query.

    Args:
        df: DataFrame whose cells hold JSON text (or nulls).
        dtypes: Target DuckDB type for each column, in column order.

    Returns:
        A relation with the same column names as *df*, cast to *dtypes*.
    """

    def quote(name) -> str:
        # SQL identifier quoting: wrap in double quotes, double any inner quote.
        return '"' + str(name).replace('"', '""') + '"'

    projection = ", ".join(
        f"(ifnull({quote(col)}, 'null')::JSON)::{str(dtype)} as {quote(col)}"
        for col, dtype in zip(df.columns, dtypes)
    )
    # The SQL refers to `df` by name; DuckDB resolves it from this function's
    # Python variables, so the parameter must keep that name.
    return duckdb.sql(f"SELECT {projection} FROM df")
# Build the UI: a hidden JSON holder, the page title, the dataset / subset /
# split selectors next to a login button, and the editable dataframe.
# NOTE(review): indentation is not visible in this view, so the exact nesting
# of the Row / Column / Group containers below cannot be confirmed here.
with gr.Blocks(css=css) as demo: | |
# Invisible JSON component; the handlers in this file write the selected
# dataset's loading codes into it.
loading_codes_json = gr.JSON(visible=False) | |
with gr.Row(): | |
with gr.Column(): | |
# NOTE(review): the symbols around the title look like mis-encoded emoji;
# compare with the upstream file before changing the string.
gr.Markdown("# <p style='text-align:center;'>π€ (WIP) Hugging Face Dataset Spreadsheets π</p>\n\n<p style='text-align:center;'>Edit any dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)") | |
with gr.Group(): | |
with gr.Row(): | |
# Free-text entry is allowed, so the value may be any string the user types.
dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10) | |
# Subset and split selectors start hidden; show_subset_dropdown /
# show_split_dropdown set `visible` only when there is more than one choice.
subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown") | |
split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown") | |
gr.LoginButton() | |
# Starts out showing the all-NULL EMPTY_TABLE placeholder, rendered through to_json_df.
dataframe = gr.DataFrame(to_json_df(EMPTY_TABLE), interactive=True, wrap=True) | |
def show_subset_dropdown(dataset: str):
    """Compute the subset dropdown settings and loading codes for *dataset*.

    Asks the datasets-server ``compatible-libraries`` endpoint which libraries
    can load the dataset and keeps the ``loading_codes`` of the first library
    whose ``function`` is listed in ``READ_PARQUET_FUNCTIONS``.

    An empty name, or one without a ``/`` separator, is not looked up at all.
    That case returns the same two-element shape as every other path, so
    callers can always unpack the result.

    Args:
        dataset: Dataset id as typed or selected by the user.

    Returns:
        A ``(dropdown_kwargs, loading_codes)`` tuple. ``dropdown_kwargs`` holds
        ``choices``, ``value``, ``visible`` and ``key`` for ``gr.Dropdown``;
        ``loading_codes`` is the list of per-subset loading-code dicts
        (empty when nothing usable was found).

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    loading_codes = []
    if dataset and "/" in dataset.strip().strip("/"):
        # Let requests build the query string so the dataset id is URL-encoded.
        resp = requests.get(
            "https://datasets-server.huggingface.co/compatible-libraries",
            params={"dataset": dataset},
            timeout=3,
        ).json()
        loading_codes = next(
            (
                lib["loading_codes"]
                for lib in resp.get("libraries", [])
                if lib["function"] in READ_PARQUET_FUNCTIONS
            ),
            None,
        ) or []
    subsets = [loading_code["config_name"] for loading_code in loading_codes]
    subset = subsets[0] if subsets else ""
    # The selector is only worth showing when there is an actual choice to make.
    # `key` changes whenever the loading codes change.
    return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes
def show_split_dropdown(subset: str, loading_codes: list[dict]):
    """Compute the split dropdown settings for *subset*.

    Looks through *loading_codes* for entries whose ``config_name`` equals
    *subset* and uses the split names of the first such entry.

    Args:
        subset: Name of the selected subset.
        loading_codes: Per-subset loading-code dicts, each with
            ``config_name`` and ``arguments["splits"]``.

    Returns:
        A dict with ``choices``, ``value``, ``visible`` and ``key`` for
        ``gr.Dropdown``. ``value`` is the first split or ``""``; ``visible``
        is true only when there is more than one split.
    """
    candidates = []
    for code in loading_codes:
        if code["config_name"] == subset:
            candidates.append(list(code["arguments"]["splits"]))
    split_names = candidates[0] if candidates else []
    first_split = split_names[0] if split_names else ""
    return {
        "choices": split_names,
        "value": first_split,
        "visible": len(split_names) > 1,
        "key": hash(str(loading_codes) + subset),
    }
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
    """Load the first page of the selected split, or fall back to the placeholder.

    The file pattern for *split* is taken from the first loading code whose
    ``config_name`` equals *subset*. A split that is not listed for that
    subset (for example a free-typed value) yields the placeholder table
    rather than a ``KeyError``.

    Args:
        dataset: Dataset id.
        subset: Selected subset name.
        split: Selected split name.
        loading_codes: Per-subset loading-code dicts.

    Returns:
        A dict with a single ``value`` entry holding the DataFrame produced by
        ``to_json_df``, ready to be passed to ``gr.DataFrame``.
    """
    pattern = next(
        (
            loading_code["arguments"]["splits"].get(split)
            for loading_code in loading_codes
            if loading_code["config_name"] == subset
        ),
        None,
    )
    if dataset and subset and split and pattern:
        # The dataset id can be free-typed by the user, so escape single quotes
        # before placing the path inside a SQL string literal.
        path = f"hf://datasets/{dataset}/{pattern}".replace("'", "''")
        tbl = cached_duckdb_sql(f"SELECT * FROM '{path}' LIMIT {PAGE_SIZE}")
    else:
        tbl = EMPTY_TABLE
    return dict(value=to_json_df(tbl))
def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
    """Populate every selector and the dataframe for the initial view.

    Lists trending parquet datasets and, when an OAuth token is present and
    ``whoami`` reports a name, that user's parquet datasets as well. The
    dataset shown is the ``dataset`` query parameter if given, otherwise the
    first listed dataset, otherwise an empty selection.

    NOTE(review): no event registration for this handler is visible in this
    file; confirm where it is attached.

    Args:
        request: Incoming request; only its query parameters are read.
        oauth_token: Token of the logged-in user, or ``None``.

    Returns:
        A dict keyed by UI component with the new dataset / subset / split
        dropdowns, the loading codes and the dataframe.
    """
    api = HfApi(token=oauth_token.token if oauth_token else None)
    datasets = list(api.list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
    if oauth_token and (user := api.whoami().get("name")):
        datasets += list(api.list_datasets(limit=NUM_USER_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
    # A user's dataset can also be trending; keep each id once, in listing order.
    dataset_ids = list(dict.fromkeys(info.id for info in datasets))
    # Guard against an empty listing instead of indexing blindly.
    dataset = request.query_params.get("dataset") or (dataset_ids[0] if dataset_ids else "")
    subsets, loading_codes = show_subset_dropdown(dataset)
    splits = show_split_dropdown(subsets["value"], loading_codes)
    input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
    return {
        dataset_dropdown: gr.Dropdown(choices=dataset_ids, value=dataset),
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subsets),
        split_dropdown: gr.Dropdown(**splits),
        dataframe: gr.DataFrame(**input_dataframe),
    }
def _show_subset_dropdown(dataset: str):
    """Refresh subset, split and dataframe after the dataset changes.

    NOTE(review): no event registration for this handler is visible in this
    file; confirm where it is attached.

    Args:
        dataset: Newly selected dataset id.

    Returns:
        A dict keyed by UI component with the loading codes, both dropdowns
        and the dataframe.
    """
    subset_kwargs, loading_codes = show_subset_dropdown(dataset)
    chosen_subset = subset_kwargs["value"]
    split_kwargs = show_split_dropdown(chosen_subset, loading_codes)
    table_kwargs = show_input_dataframe(dataset, chosen_subset, split_kwargs["value"], loading_codes)

    updates = {loading_codes_json: loading_codes}
    updates[subset_dropdown] = gr.Dropdown(**subset_kwargs)
    updates[split_dropdown] = gr.Dropdown(**split_kwargs)
    updates[dataframe] = gr.DataFrame(**table_kwargs)
    return updates
def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
    """Refresh the split dropdown and dataframe after the subset changes.

    NOTE(review): no event registration for this handler is visible in this
    file; confirm where it is attached.

    Args:
        dataset: Current dataset id.
        subset: Newly selected subset name.
        loading_codes: Per-subset loading-code dicts for the dataset.

    Returns:
        A dict keyed by UI component with the split dropdown and the dataframe.
    """
    split_kwargs = show_split_dropdown(subset, loading_codes)
    table_kwargs = show_input_dataframe(dataset, subset, split_kwargs["value"], loading_codes)

    updates = {split_dropdown: gr.Dropdown(**split_kwargs)}
    updates[dataframe] = gr.DataFrame(**table_kwargs)
    return updates
def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> dict:
    """Refresh the dataframe after the split changes.

    The return annotation is ``dict`` because the function returns a
    component-keyed mapping, not a bare pandas DataFrame.

    NOTE(review): no event registration for this handler is visible in this
    file; confirm where it is attached.

    Args:
        dataset: Current dataset id.
        subset: Current subset name.
        split: Newly selected split name.
        loading_codes: Per-subset loading-code dicts for the dataset.

    Returns:
        A one-entry dict mapping the dataframe component to its new contents.
    """
    input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes)
    return {dataframe: gr.DataFrame(**input_dataframe)}
# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()