
Add/st dataframe pagenation #77

Merged — 6 commits merged on Aug 30, 2024
Changes from 3 commits
src/common.py — 78 additions, 0 deletions
@@ -251,6 +251,68 @@ def v_space(n: int, col=None) -> None:
    else:
        st.write("#")

def display_large_dataframe(df,
                            chunk_sizes: list[int] = [100, 1_000, 10_000]):
    """
    Displays a large DataFrame in chunks with pagination controls and row selection.

    Args:
        df: The DataFrame to display.
        chunk_sizes: A list of chunk sizes to choose from.

    Returns:
        Selected rows from the current chunk.
    """
    # Dropdown for selecting chunk size
    chunk_size = st.selectbox("Select Number of Rows to Display", chunk_sizes)

    # Calculate total number of chunks
    total_chunks = (len(df) + chunk_size - 1) // chunk_size

    # Initialize session state for pagination
    if 'current_chunk' not in st.session_state:
        st.session_state.current_chunk = 0

    # Function to get the current chunk of the DataFrame
    def get_current_chunk(df, chunk_size, chunk_index):
        start = chunk_index * chunk_size
        end = min(start + chunk_size, len(df))  # Ensure end does not exceed dataframe length
        return df.iloc[start:end], start, end

    # Display the current chunk
    current_chunk_df, start_row, end_row = get_current_chunk(df, chunk_size, st.session_state.current_chunk)

    event = st.dataframe(
        current_chunk_df,
        column_order=[
            "spectrum ID",
            "RT",
            "MS level",
            "max intensity m/z",
            "precursor m/z",
        ],
        selection_mode="single-row",
        on_select="rerun",
        use_container_width=True,
        hide_index=True,
    )

Member
This should be more flexible, as the data displayed may vary. I think adding column_order as a function parameter would work best. Is there a reason why selection_mode, on_select, use_container_width, and hide_index have to be set to these specific values? If not, I would suggest adding those as parameters as well.

Anything specific to a particular workflow/dataset should not be located in common.py, as it is intended to contain functions of general use. As a side note, common modules should be moved into a separate directory to make this clearer to developers.

Contributor Author
That's a good point. These are set specifically because I came across this issue when using the pyopenms workflow, which selects spectra to plot when a row is selected, but this should be more flexible for other use cases.

We could pass kwargs through display_large_dataframe to st.dataframe, because that is probably the only call that would need to change for particular needs. I opted for this option.

st.write(f"Showing rows {start_row + 1} to {end_row} of {len(df)} ({get_dataframe_mem_useage(current_chunk_df):.2f} MB)")

# Pagination buttons
col1, col2, col3 = st.columns([1, 2, 1])

with col1:
if st.button("Previous") and st.session_state.current_chunk > 0:
st.session_state.current_chunk -= 1

with col3:
if st.button("Next") and st.session_state.current_chunk < total_chunks - 1:
st.session_state.current_chunk += 1

if event is not None:
return event
return None
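For orientation, a hedged usage sketch of the helper as merged. The import path and the example DataFrame are assumptions (view.py imports it as `from .common import display_large_dataframe`), and the column names simply mirror the hard-coded column_order above:

```python
import pandas as pd
import streamlit as st

from src.common import display_large_dataframe  # import path assumed for this sketch

# Illustrative data only; real callers pass e.g. the spectra overview table.
df = pd.DataFrame({
    "spectrum ID": range(1, 25_001),
    "RT": [i * 0.01 for i in range(25_000)],
    "MS level": [1] * 25_000,
    "max intensity m/z": [500.0] * 25_000,
    "precursor m/z": [None] * 25_000,
})

event = display_large_dataframe(df)
if event is not None and event.selection.rows:
    # Selection indices are positions within the currently displayed chunk, not the full DataFrame.
    st.write(f"Selected row {event.selection.rows[0]} of the current chunk")
```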

def show_table(df: pd.DataFrame, download_name: str = "") -> None:
"""
Expand Down Expand Up @@ -353,6 +415,22 @@ def reset_directory(path: Path) -> None:
shutil.rmtree(path)
path.mkdir(parents=True, exist_ok=True)

def get_dataframe_mem_useage(df):
    """
    Get the memory usage of a pandas DataFrame in megabytes.

    Args:
        df (pd.DataFrame): The DataFrame to calculate the memory usage for.

    Returns:
        float: The memory usage of the DataFrame in megabytes.
    """
    # Calculate the memory usage of the DataFrame in bytes
    memory_usage_bytes = df.memory_usage(deep=True).sum()
    # Convert bytes to megabytes
    memory_usage_mb = memory_usage_bytes / (1024 ** 2)
    return memory_usage_mb
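A quick illustration (not part of the PR) of why deep=True matters here: without it, pandas counts only the pointers for object/string columns, so the reported size would understate the real footprint of mixed-type tables like the spectra overview. The data below is illustrative:

```python
import pandas as pd

df = pd.DataFrame({"name": [f"spectrum_{i}" for i in range(100_000)]})

shallow_mb = df.memory_usage().sum() / (1024 ** 2)          # object pointers only
deep_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)    # includes the string payloads

print(f"shallow: {shallow_mb:.2f} MB, deep: {deep_mb:.2f} MB")  # deep is noticeably larger
```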


# General warning/error messages
WARNINGS = {
src/view.py — 2 additions, 16 deletions

@@ -6,7 +6,7 @@
import streamlit as st
import pyopenms as poms
from .plotting.MSExperimentPlotter import plotMSExperiment
-from .common import show_fig
+from .common import show_fig, display_large_dataframe

from typing import Union

@@ -216,27 +216,13 @@ def view_peak_map():
    peak_map_3D = plotMSExperiment(df, plot3D=True, title="")
    st.pyplot(peak_map_3D, use_container_width=True)


@st.experimental_fragment
def view_spectrum():
    cols = st.columns([0.34, 0.66])
    with cols[0]:
        df = st.session_state.view_spectra.copy()
        df["spectrum ID"] = df.index + 1
-        event = st.dataframe(
-            df,
-            column_order=[
-                "spectrum ID",
-                "RT",
-                "MS level",
-                "max intensity m/z",
-                "precursor m/z",
-            ],
-            selection_mode="single-row",
-            on_select="rerun",
-            use_container_width=True,
-            hide_index=True,
-        )
+        event = display_large_dataframe(df)
        rows = event.selection.rows
    with cols[1]:
        if rows: