MyBinder Analytics Data Analysis - MyBinder Analytics Report

This notebook analyzes MyBinder launch data from the automated releases at the following URL:

https://github.com/jupyterhub/binder-data/releases/tag/latest

def load_binder_data(
    url="https://github.com/jupyterhub/binder-data/releases/download/latest/launches.parquet",
):
    """
    Load MyBinder launch records from a parquet file.

    Args:
        url: Location of the parquet file; defaults to the ``launches.parquet``
            asset of the ``latest`` binder-data GitHub release.

    Returns:
        DataFrame of launch records whose ``timestamp`` column has been
        passed through ``pd.to_datetime``.

    Raises:
        Exception: Whatever reading or timestamp conversion raised. The error
            is printed first and then re-raised unchanged so the notebook run
            fails instead of continuing without data.
    """
    print(f"Loading data from: {url}")

    try:
        df = pd.read_parquet(url)
        # Normalise before reporting so the date range below is computed on
        # datetime values rather than on whatever type the file stored.
        df["timestamp"] = pd.to_datetime(df["timestamp"])
    except Exception as e:
        print(f"✗ Failed to load data: {e}")
        raise  # Re-raise the exception to fail the notebook

    print(f"✓ Loaded {len(df):,} records")
    print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    return df

# Load the launch data once; the analysis calls further down all receive this
# module-level `df`.
df = load_binder_data()

Loading data from: https://github.com/jupyterhub/binder-data/releases/download/latest/launches.parquet

✓ Loaded 35,316,099 records
Date range: 2018-11-03 00:00:00+00:00 to 2025-08-20 19:06:00+00:00

Launches Over Time by Source

Weekly trends of Binder launches grouped by the BinderHub that launched them.

def plot_launches_by_source_over_time(df):
    """
    Draw an interactive stacked area chart of weekly launch counts per origin.

    Rows from the seven days leading up to the newest timestamp are left out
    because that week may be incomplete, and only the eight origins with the
    most launches are drawn. When ``df`` is empty or lacks a ``timestamp`` or
    ``origin`` column, a message is printed and nothing is plotted.
    """
    if df.empty or any(col not in df.columns for col in ("timestamp", "origin")):
        print("Missing required columns for time series analysis")
        return

    # Everything at or after this boundary belongs to the possibly incomplete week.
    boundary = df["timestamp"].max() - timedelta(days=7)
    complete = df[df["timestamp"] < boundary]

    # Restrict the chart to the eight busiest origins to keep it readable.
    busiest = complete["origin"].value_counts().head(8).index

    # One row per (origin, week) holding the number of launches in that week.
    per_origin = complete.set_index("timestamp").groupby("origin")
    weekly = per_origin.resample("W", include_groups=False).size().reset_index()
    weekly.columns = ["origin", "week", "launches"]
    weekly_busiest = weekly[weekly["origin"].isin(busiest)]

    chart = px.area(
        weekly_busiest,
        x="week",
        y="launches",
        color="origin",
        title="MyBinder Launches Over Time by Source (Weekly - Stacked)",
        labels={"week": "Date", "launches": "Number of Launches", "origin": "Source"},
    )
    chart.update_layout(
        height=600,
        hovermode="x unified",
        legend={"orientation": "h", "yanchor": "top", "y": -0.1, "xanchor": "center", "x": 0.5},
    )
    chart.show()

# Render the weekly launches-by-origin chart for the loaded data.
plot_launches_by_source_over_time(df)

Top 25 Repositories (Last 90 Days)

The most popular repositories launched on MyBinder in the last 90 days.

def plot_top_repositories_last_90_days(df, top_n=25):
    """
    Draw an interactive horizontal bar chart of the most-launched repositories.

    The window covers the 90 days that end one week before the newest
    timestamp; the latest week is excluded because it may be incomplete.
    Launches are grouped per repository rather than per ref.

    Args:
        df: Launch records with at least ``timestamp`` and ``spec`` columns.
        top_n: Number of repositories to show. Defaults to 25, matching the
            chart title.

    Returns:
        None. The figure is shown as a side effect. When ``df`` is empty or a
        required column is missing, a message is printed and nothing is drawn.
    """
    if df.empty or "timestamp" not in df.columns or "spec" not in df.columns:
        print("Missing required columns for repository analysis")
        return

    # Exclude the latest week of data (may be incomplete)
    latest_week_start = df["timestamp"].max() - timedelta(days=7)

    # Filter to last 90 days (excluding latest week)
    cutoff_date = latest_week_start - timedelta(days=90)
    in_window = (df["timestamp"] >= cutoff_date) & (df["timestamp"] < latest_week_start)

    # Pull only the repo so we aren't separating by the ref
    def remove_ref(s):
        parts = s.split("/")
        # Assume it is org/repo/ref
        if len(parts) == 3:
            return "/".join(parts[:2])
        # Assume it's something else and leave it untouched
        return s

    # Build a standalone Series instead of assigning a new column onto a
    # filtered slice of `df`, which triggers SettingWithCopyWarning.
    # na_action="ignore" skips missing specs rather than failing in
    # remove_ref; value_counts() drops them afterwards.
    repos = df.loc[in_window, "spec"].map(remove_ref, na_action="ignore")

    # Count launches by repository and keep the requested number of leaders
    repo_counts = repos.value_counts().head(top_n)

    # Create DataFrame for plotting
    plot_df = pd.DataFrame({
        "repository": repo_counts.index,
        "launches": repo_counts.values,
    })

    fig = px.bar(
        plot_df,
        x="launches",
        y="repository",
        orientation="h",
        title=f"Top {top_n} Repositories by Launches (Last 90 Days, excluding latest week)",
        labels={"launches": "Number of Launches", "repository": "Repository"},
        text="launches",
    )

    fig.update_layout(
        # About 32 px per bar (the density of the previous fixed 1600 px / 50
        # bars layout), with a floor so short charts stay legible.
        height=max(600, 32 * len(plot_df)),
        yaxis={"categoryorder": "total ascending"},
    )

    fig.update_traces(textposition="outside")

    fig.show()

# Render the top-repositories bar chart for the loaded data.
plot_top_repositories_last_90_days(df)

Provider Distribution (Last 90 Days)

Distribution of launches by git provider (GitHub, GitLab, etc.) in the last 90 days.

def plot_provider_distribution_last_90_days(df):
    """
    Draw a bar chart of launch counts per provider.

    The window covers the 90 days that end one week before the newest
    timestamp; the latest week is skipped because it may be incomplete.
    When ``df`` is empty or lacks a ``timestamp`` or ``provider`` column,
    a message is printed and nothing is plotted.
    """
    if df.empty or any(col not in df.columns for col in ("timestamp", "provider")):
        print("Missing required columns for provider analysis")
        return

    # Window is [window_start, window_end): 90 days ending a week before the newest record.
    window_end = df["timestamp"].max() - timedelta(days=7)
    window_start = window_end - timedelta(days=90)
    stamps = df["timestamp"]
    recent = df[(stamps >= window_start) & (stamps < window_end)]

    # Tally launches per provider and reshape into a two-column frame for plotly.
    counts = recent["provider"].value_counts()
    plot_df = pd.DataFrame({"provider": counts.index, "launches": counts.values})

    chart = px.bar(
        plot_df,
        x="provider",
        y="launches",
        title="MyBinder Launches by Provider (Last 90 Days, excluding latest week)",
        labels={"launches": "Number of Launches", "provider": "Provider"},
        text="launches",
    )
    chart.update_layout(height=600, xaxis={"categoryorder": "total descending"})
    chart.update_traces(textposition="outside")
    chart.show()

# Render the launches-by-provider bar chart for the loaded data.
plot_provider_distribution_last_90_days(df)

def create_monthly_launches_table(df):
    """
    Build a month-by-provider table of launch counts.

    The window covers the 365 days that end one week before the newest
    timestamp; the latest week is skipped because it may be incomplete.

    Returns a DataFrame with one row per month (newest first, labelled as
    strings), one integer column per provider and a ``Total`` column. When
    ``df`` is empty or lacks a ``timestamp`` or ``provider`` column, a
    message is printed and None is returned.
    """
    if df.empty or any(col not in df.columns for col in ("timestamp", "provider")):
        print("Missing required columns for monthly analysis")
        return

    # Window is [window_start, window_end): 365 days ending a week before the newest record.
    window_end = df["timestamp"].max() - timedelta(days=7)
    window_start = window_end - timedelta(days=365)
    in_window = (df["timestamp"] >= window_start) & (df["timestamp"] < window_end)

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    recent = df[in_window].assign(
        month_year=lambda frame: frame["timestamp"].dt.to_period("M")
    )

    # Long format: one row per (month, provider) with its launch count.
    counts = recent.groupby(["month_year", "provider"]).size().reset_index(name="launches")

    # Wide format: providers become columns; absent combinations count as zero.
    table = counts.pivot(index="month_year", columns="provider", values="launches")
    table = table.fillna(0).astype(int)

    table["Total"] = table.sum(axis=1)

    # Newest month first, with plain-string labels for display.
    table = table.sort_index(ascending=False)
    table.index = table.index.astype(str)
    return table

# Build the month-by-provider table and display it as a captioned Styler.
# NOTE(review): create_monthly_launches_table returns None when `df` is empty
# or lacks the required columns, in which case `.style` raises AttributeError.
monthly_table = create_monthly_launches_table(df)
monthly_table.style.set_caption("Monthly launches by repository provider.")

Monthly Launches by Provider (Last 12 Months)

Monthly breakdown of launches by provider with totals for the last 12 months.

def create_monthly_launches_table_by_origin(df):
    """
    Build a month-by-origin table of launch counts.

    The window covers the 365 days that end one week before the newest
    timestamp; the latest week is skipped because it may be incomplete.

    Returns a DataFrame with one row per month (newest first, labelled as
    strings), one integer column per origin and a ``Total`` column. When
    ``df`` is empty or lacks a ``timestamp`` or ``origin`` column, a message
    is printed and None is returned.
    """
    if df.empty or any(col not in df.columns for col in ("timestamp", "origin")):
        print("Missing required columns for monthly origin analysis")
        return

    # Window is [oldest_allowed, newest_allowed): 365 days ending a week before the newest record.
    newest_allowed = df["timestamp"].max() - timedelta(days=7)
    oldest_allowed = newest_allowed - timedelta(days=365)
    stamps = df["timestamp"]
    selected = df[(stamps >= oldest_allowed) & (stamps < newest_allowed)]

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    selected = selected.assign(month_year=selected["timestamp"].dt.to_period("M"))

    # Long format: one row per (month, origin) with its launch count.
    grouped = selected.groupby(["month_year", "origin"])
    long_counts = grouped.size().reset_index(name="launches")

    # Wide format: origins become columns; absent combinations count as zero.
    wide = long_counts.pivot(index="month_year", columns="origin", values="launches")
    wide = wide.fillna(0).astype(int)

    wide["Total"] = wide.sum(axis=1)

    # Newest month first, with plain-string labels for display.
    wide = wide.sort_index(ascending=False)
    wide.index = wide.index.astype(str)
    return wide

# Build the month-by-origin table and display it as a captioned Styler.
# NOTE(review): create_monthly_launches_table_by_origin returns None when `df`
# is empty or lacks the required columns, in which case `.style` raises
# AttributeError.
monthly_origin_table = create_monthly_launches_table_by_origin(df)
monthly_origin_table.style.set_caption("Monthly launches by BinderHub provider.")