MyBinder Analytics Data Analysis - MyBinder Analytics Report

This notebook analyzes MyBinder launch data from the automated releases at the following URL:

github.com/jupyterhub/binder-data/releases/tag/latest

Last updated:

'March 01, 2026'

def load_binder_data(
    url="https://github.com/jupyterhub/binder-data/releases/download/latest/launches.parquet",
    local_file="_build/data/launches.parquet",
    max_retries=3,
    retry_delay=1.0,
):
    """
    Load MyBinder analytics data from a local cache, downloading it on first use.

    If ``local_file`` exists it is read directly. Otherwise the parquet file is read
    from ``url`` (retrying on failure), saved to ``local_file``, and returned.

    Args:
        url: URL to the parquet file in GitHub releases
        local_file: Path to local parquet file used as the cache
        max_retries: Total number of download attempts before giving up (at least 1)
        retry_delay: Seconds to wait between failed download attempts

    Returns:
        DataFrame with binder launch data; the "timestamp" column is parsed with
        ``pd.to_datetime``.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        Exception: The error from the last download attempt is re-raised unchanged
            once every attempt has failed.
    """
    import os
    import time

    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    def _parse_and_report(frame):
        # Parse first so the printed range is a true chronological min/max even when
        # the stored column is not already a datetime dtype.
        frame["timestamp"] = pd.to_datetime(frame["timestamp"])
        print(f"✓ Loaded {len(frame):,} records")
        print(f"Date range: {frame['timestamp'].min()} to {frame['timestamp'].max()}")
        return frame

    # Create directory if it doesn't exist. A bare filename has an empty dirname,
    # and os.makedirs("") raises, so only create a directory when there is one.
    cache_dir = os.path.dirname(local_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Check if local file exists
    if os.path.exists(local_file):
        print(f"✓ Found local file: {local_file}")
        print(f"Using cached data (delete {local_file} to redownload)")
        return _parse_and_report(pd.read_parquet(local_file))

    # Download from URL
    print(f"Loading data from: {url}")
    for attempt in range(1, max_retries + 1):
        try:
            df = pd.read_parquet(url)
        except Exception as e:
            if attempt == max_retries:
                print(f"✗ Failed to load data after {max_retries} attempts: {e}")
                raise
            print(f"✗ Attempt {attempt} failed: {e}")
            print(f"Retrying ({attempt + 1}/{max_retries})...")
            # Give a transient network problem a moment to clear before retrying.
            time.sleep(retry_delay)
        else:
            break

    # Parsing and saving happen outside the retry loop: downloading again cannot fix
    # a local parsing or disk error, so those propagate immediately.
    df = _parse_and_report(df)

    # Save locally for future use. Write to a temporary name and rename so an
    # interrupted write never leaves a truncated file that later runs would trust.
    partial_file = local_file + ".tmp"
    df.to_parquet(partial_file)
    os.replace(partial_file, local_file)
    print(f"✓ Saved to {local_file}")

    return df

# Load the data once for the whole notebook; every analysis cell below reads this DataFrame.
# With the default arguments, the first run downloads the parquet file and later runs
# read the local copy at _build/data/launches.parquet.
df = load_binder_data()

Loading data from: https://github.com/jupyterhub/binder-data/releases/download/latest/launches.parquet

✓ Loaded 36,442,829 records
Date range: 2018-11-03 00:00:00+00:00 to 2026-02-28 22:56:00+00:00

✓ Saved to _build/data/launches.parquet

Launches Over Time by Source¶

Weekly trends of Binder launches grouped by the BinderHub that launched them.

def plot_launches_by_source_over_time(df):
    """
    Create interactive stacked area chart showing number of launches over time by week for each source.

    Shows the 450 days (about 15 months) that end with the last complete
    Monday-Sunday week. Origins with median monthly launches below 50 in that
    window are filtered out, and at most the 8 busiest remaining origins are drawn.
    A dotted vertical line marks one year before ``today``.

    Args:
        df: DataFrame with a datetime "timestamp" column and an "origin" column.

    Returns:
        None. The figure is rendered with ``fig.show()``. If ``df`` is empty or a
        required column is missing, a message is printed and nothing is drawn.
    """
    if df.empty or "timestamp" not in df.columns or "origin" not in df.columns:
        print("Missing required columns for time series analysis")
        return

    # resample("W") produces Monday-through-Sunday bins, so the week containing the
    # latest record is the only one that can be incomplete. Cutting at that week's
    # Monday 00:00 (exclusive) keeps every finished week in full. Cutting at the
    # latest record's time of day would truncate the final Sunday.
    latest_date = pd.Timestamp(df["timestamp"].max())
    current_week_start = latest_date.normalize() - timedelta(days=latest_date.weekday())

    # Filter for last 15 months
    fifteen_months_ago = current_week_start - timedelta(days=450)
    in_window = (df["timestamp"] >= fifteen_months_ago) & (df["timestamp"] < current_week_start)
    df_filtered = df[in_window]

    # Calculate median monthly launches per origin over the window. Grouping by a
    # derived Series avoids copying the whole slice just to add a helper column.
    month_of_launch = df_filtered["timestamp"].dt.to_period("M").rename("month_year")
    monthly_counts = df_filtered.groupby(["origin", month_of_launch]).size()
    median_monthly = monthly_counts.groupby(level="origin").median()

    # Keep only origins with median monthly launches >= 50
    active_origins = median_monthly[median_monthly >= 50].index
    df_filtered = df_filtered[df_filtered["origin"].isin(active_origins)]

    # Resample by week and count launches by origin
    df_weekly = (
        df_filtered.set_index("timestamp").groupby("origin").resample("W", include_groups=False).size().reset_index()
    )
    df_weekly.columns = ["origin", "week", "launches"]

    # Get top sources for better visualization
    top_sources = df_filtered["origin"].value_counts().head(8).index
    df_weekly_top = df_weekly[df_weekly["origin"].isin(top_sources)]

    # Calculate "one year ago" from today.
    # NOTE(review): `today` is not defined in this function; it is read from the
    # notebook's global scope. Presumably a datetime set in an earlier cell - confirm.
    one_year_ago = today - timedelta(days=365)

    fig = px.area(
        df_weekly_top,
        x="week",
        y="launches",
        color="origin",
        title="MyBinder Launches Over Time by Source (Last 15 Months)",
        labels={"week": "Date", "launches": "Number of Launches", "origin": "Source"}
    )

    # Add vertical line for "one year ago"
    fig.add_vline(
        x=one_year_ago.timestamp() * 1000,  # Convert to milliseconds
        line_dash="dot",
        line_color="gray",
        line_width=2,
        annotation_text="One year ago",
        annotation_position="top"
    )

    fig.update_layout(
        height=600,
        hovermode="x unified",
        legend=dict(orientation="h", yanchor="top", y=-0.1, xanchor="center", x=0.5)
    )

    fig.show()

# Render the weekly launches-by-source chart from the DataFrame loaded above.
plot_launches_by_source_over_time(df)

Top 25 Repositories (Last 90 Days)¶

The most popular repositories launched on MyBinder in the last 90 days.

def plot_top_repositories_last_90_days(df, top_n=25):
    """
    Create interactive horizontal bar chart of launches in the last 90 days for the top repositories.
    Excludes the latest week of data which may be incomplete.

    Launches of the same repository at different refs are counted together when the
    spec has exactly three "/"-separated parts (treated as org/repo/ref). Any other
    spec is counted under its full text.

    Args:
        df: DataFrame with a datetime "timestamp" column and a "spec" column.
        top_n: Number of repositories to show (default 25, matching the chart title).

    Returns:
        None. The figure is rendered with ``fig.show()``. If ``df`` is empty or a
        required column is missing, a message is printed and nothing is drawn.
    """
    if df.empty or "timestamp" not in df.columns or "spec" not in df.columns:
        print("Missing required columns for repository analysis")
        return

    # Exclude the latest week of data (may be incomplete)
    latest_week_start = df["timestamp"].max() - timedelta(days=7)

    # Filter to last 90 days (excluding latest week)
    cutoff_date = latest_week_start - timedelta(days=90)
    in_window = (df["timestamp"] >= cutoff_date) & (df["timestamp"] < latest_week_start)

    # Pull only the repo so we aren't separating by the ref
    def remove_ref(s):
        parts = s.split("/")
        # Assume it is org/repo/ref
        if len(parts) == 3:
            return "/".join(parts[:2])
        # Assume it's something else
        return s

    # Map the spec Series directly instead of assigning a new column onto a filtered
    # slice, which avoids SettingWithCopyWarning. Missing specs are dropped because
    # they cannot be split and value_counts would ignore them anyway.
    repos = df.loc[in_window, "spec"].dropna().map(remove_ref)

    # Count launches by repository
    repo_counts = repos.value_counts().head(top_n)

    # Create DataFrame for plotting
    plot_df = pd.DataFrame({
        'repository': repo_counts.index,
        'launches': repo_counts.values
    })

    fig = px.bar(
        plot_df,
        x="launches",
        y="repository",
        orientation="h",
        title=f"Top {top_n} Repositories by Launches (Last 90 Days, excluding latest week)",
        labels={"launches": "Number of Launches", "repository": "Repository"},
        text="launches"
    )

    fig.update_layout(
        # 32 px per bar keeps labels readable at any top_n; never shrink below 400 px.
        height=max(400, 32 * len(plot_df)),
        yaxis={'categoryorder': 'total ascending'}
    )

    fig.update_traces(textposition='outside')

    fig.show()

# Render the top-repositories bar chart from the DataFrame loaded above.
plot_top_repositories_last_90_days(df)

Provider Distribution (Last 90 Days)¶

Distribution of launches by git provider (GitHub, GitLab, etc.) in the last 90 days.

def plot_provider_distribution_last_90_days(df):
    """
    Draw a bar chart of launch counts per provider.

    The chart covers the 90 days that end one week before the newest timestamp,
    so the most recent (possibly incomplete) week is left out.

    Args:
        df: DataFrame with a "timestamp" column and a "provider" column.

    Returns:
        None. The figure is shown with ``fig.show()``. When ``df`` is empty or
        lacks a required column, a message is printed instead.
    """
    required_columns = ("timestamp", "provider")
    if df.empty or any(name not in df.columns for name in required_columns):
        print("Missing required columns for provider analysis")
        return

    stamps = df["timestamp"]

    # The window is [window_start, window_end): 90 days ending a week before the
    # newest record.
    window_end = stamps.max() - timedelta(days=7)
    window_start = window_end - timedelta(days=90)
    recent = df[(stamps >= window_start) & (stamps < window_end)]

    # Launch tally per provider, largest first.
    tally = recent["provider"].value_counts()
    plot_df = pd.DataFrame({"provider": tally.index, "launches": tally.values})

    axis_labels = {"launches": "Number of Launches", "provider": "Provider"}
    fig = px.bar(
        plot_df,
        x="provider",
        y="launches",
        title="MyBinder Launches by Provider (Last 90 Days, excluding latest week)",
        labels=axis_labels,
        text="launches",
    )
    fig.update_layout(height=600, xaxis={"categoryorder": "total descending"})
    fig.update_traces(textposition="outside")
    fig.show()

# Render the launches-by-provider bar chart from the DataFrame loaded above.
plot_provider_distribution_last_90_days(df)

def create_monthly_launches_table(df, months=15):
    """
    Create a table showing monthly launches by provider for the last 15 months.

    The window is made of whole calendar months: the month of the latest timestamp
    plus the ``months - 1`` months before it. Only the newest month can be partial,
    because it includes data up to the latest timestamp.

    Args:
        df: DataFrame with a datetime "timestamp" column and a "provider" column.
        months: Number of calendar months to include (default 15).

    Returns:
        DataFrame indexed by "YYYY-MM" strings (most recent first), with one integer
        column per provider plus a "Total" column. Returns None, after printing a
        message, if ``df`` is empty or a required column is missing.

    Raises:
        ValueError: If ``months`` is less than 1.
    """
    if months < 1:
        raise ValueError("months must be at least 1")

    if df.empty or "timestamp" not in df.columns or "provider" not in df.columns:
        print("Missing required columns for monthly analysis")
        return

    # Use the full dataset up to the latest timestamp
    latest_date = pd.Timestamp(df["timestamp"].max())

    # Start the window on the first day of a month. A fixed day count would land
    # mid-month, understating the oldest row and changing the number of rows from
    # one run date to the next.
    current_month_start = latest_date.normalize().replace(day=1)
    cutoff_date = current_month_start - pd.DateOffset(months=months - 1)

    # Only the two columns used below are copied.
    df_filtered = df.loc[df["timestamp"] >= cutoff_date, ["timestamp", "provider"]].copy()

    # Create month-year column
    df_filtered["month_year"] = df_filtered["timestamp"].dt.to_period("M")

    # Group by month and provider, count launches
    monthly_provider_counts = (
        df_filtered.groupby(["month_year", "provider"])
        .size()
        .reset_index(name="launches")
    )

    # Pivot to get providers as columns; a provider with no launches in a month gets 0
    pivot_table = monthly_provider_counts.pivot(
        index="month_year", columns="provider", values="launches"
    ).fillna(0).astype(int)

    # Add total column
    pivot_table["Total"] = pivot_table.sum(axis=1)

    # Sort by month (most recent first)
    pivot_table = pivot_table.sort_index(ascending=False)

    # Convert index to string for better display
    pivot_table.index = pivot_table.index.astype(str)

    return pivot_table

# Build the month-by-provider pivot and display it with a caption.
# NOTE: create_monthly_launches_table returns None when df is empty or lacks the
# "timestamp"/"provider" columns; in that case the .style access below raises
# AttributeError.
monthly_table = create_monthly_launches_table(df)
monthly_table.style.set_caption("Monthly launches by repository provider.")

Monthly Launches by BinderHub Origin (Last 15 Months)¶

Monthly breakdown of launches grouped by the BinderHub that launched them, with totals for the last 15 months.

def create_monthly_launches_table_by_origin(df, months=15, min_median_launches=50):
    """
    Create a table showing monthly launches by BinderHub origin for the last 15 months.

    The window is made of whole calendar months: the month of the latest timestamp
    plus the ``months - 1`` months before it. Only the newest month can be partial,
    because it includes data up to the latest timestamp. Origins whose median
    monthly launches within the window are below ``min_median_launches`` are
    filtered out.

    Args:
        df: DataFrame with a datetime "timestamp" column and an "origin" column.
        months: Number of calendar months to include (default 15).
        min_median_launches: Minimum median monthly launches an origin needs to be
            kept (default 50).

    Returns:
        DataFrame indexed by "YYYY-MM" strings (most recent first), with one integer
        column per retained origin plus a "Total" column that sums the retained
        origins only. Returns None, after printing a message, if ``df`` is empty or
        a required column is missing.

    Raises:
        ValueError: If ``months`` is less than 1.
    """
    if months < 1:
        raise ValueError("months must be at least 1")

    if df.empty or "timestamp" not in df.columns or "origin" not in df.columns:
        print("Missing required columns for monthly origin analysis")
        return

    # Use the full dataset up to the latest timestamp
    latest_date = pd.Timestamp(df["timestamp"].max())

    # Start the window on the first day of a month. A fixed day count would land
    # mid-month, leaving a partial oldest month that understates its row and drags
    # down the medians used by the activity filter below.
    current_month_start = latest_date.normalize().replace(day=1)
    cutoff_date = current_month_start - pd.DateOffset(months=months - 1)

    # Only the two columns used below are copied.
    df_filtered = df.loc[df["timestamp"] >= cutoff_date, ["timestamp", "origin"]].copy()

    # Create month-year column
    df_filtered["month_year"] = df_filtered["timestamp"].dt.to_period("M")

    # Group by month and origin, count launches
    monthly_origin_counts = (
        df_filtered.groupby(["month_year", "origin"])
        .size()
        .reset_index(name="launches")
    )

    # Calculate median monthly launches per origin. The median runs over months in
    # which the origin appears in monthly_origin_counts.
    median_monthly = monthly_origin_counts.groupby("origin")["launches"].median()

    # Keep only origins with enough median monthly launches
    active_origins = median_monthly[median_monthly >= min_median_launches].index
    monthly_origin_counts = monthly_origin_counts[monthly_origin_counts["origin"].isin(active_origins)]

    # Pivot to get origins as columns; an origin with no launches in a month gets 0
    pivot_table = monthly_origin_counts.pivot(
        index="month_year", columns="origin", values="launches"
    ).fillna(0).astype(int)

    # Add total column
    pivot_table["Total"] = pivot_table.sum(axis=1)

    # Sort by month (most recent first)
    pivot_table = pivot_table.sort_index(ascending=False)

    # Convert index to string for better display
    pivot_table.index = pivot_table.index.astype(str)

    return pivot_table

# Build the month-by-origin pivot and display it with a caption.
# NOTE: create_monthly_launches_table_by_origin returns None when df is empty or lacks
# the "timestamp"/"origin" columns; in that case the .style access below raises
# AttributeError.
monthly_origin_table = create_monthly_launches_table_by_origin(df)
monthly_origin_table.style.set_caption("Monthly launches by BinderHub provider.")