This notebook analyzes MyBinder launch data published as automated releases of the jupyterhub/binder-data repository: https://github.com/jupyterhub/binder-data/releases
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Capture the notebook's run time once; the launches-over-time chart later
# uses it to place its "one year ago" marker.
today = datetime.today()
# Human-readable run date, e.g. "October 14, 2025"
today.strftime("%B %d, %Y")
'October 14, 2025'
Last updated: 'October 14, 2025'
def load_binder_data(
    url="https://github.com/jupyterhub/binder-data/releases/download/latest/launches.parquet",
    local_file="_build/data/launches.parquet",
):
    """
    Load MyBinder analytics data from GitHub release or local file.

    If ``local_file`` already exists it is read directly. Otherwise the parquet
    file is read from ``url`` (up to 3 attempts) and then written to
    ``local_file`` so later calls can reuse it. In both cases the "timestamp"
    column is converted with ``pd.to_datetime`` and the record count and date
    range are printed.

    Args:
        url: URL to the parquet file in GitHub releases
        local_file: Path to local parquet file

    Returns:
        DataFrame with binder launch data

    Raises:
        Exception: whatever the final download attempt raised, re-raised
            unchanged. Errors while writing the local cache are not retried
            and propagate immediately.
    """
    import os

    def _normalize_and_report(frame):
        # Convert before printing so the reported range is computed on real
        # datetimes rather than on whatever dtype the file happened to store.
        frame["timestamp"] = pd.to_datetime(frame["timestamp"])
        print(f"✓ Loaded {len(frame):,} records")
        print(f"Date range: {frame['timestamp'].min()} to {frame['timestamp'].max()}")
        return frame

    # Create the cache directory if needed. A bare filename has an empty
    # dirname, and os.makedirs("") raises, so skip it in that case.
    cache_dir = os.path.dirname(local_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Prefer the local cache when it exists
    if os.path.exists(local_file):
        print(f"✓ Found local file: {local_file}")
        print(f"Using cached data (delete {local_file} to redownload)")
        return _normalize_and_report(pd.read_parquet(local_file))

    # Download from URL
    print(f"Loading data from: {url}")
    max_retries = 3
    for attempt in range(1, max_retries + 1):
        # Only the download is retried; a failure while saving the cache is a
        # local problem that re-downloading would not fix.
        try:
            df = pd.read_parquet(url)
        except Exception as e:
            if attempt == max_retries:
                print(f"✗ Failed to load data after {max_retries} attempts: {e}")
                raise
            print(f"✗ Attempt {attempt} failed: {e}")
            print(f"Retrying ({attempt + 1}/{max_retries})...")
        else:
            break

    df = _normalize_and_report(df)

    # Save locally for future use
    df.to_parquet(local_file)
    print(f"✓ Saved to {local_file}")
    return df
# Load the data (read from the local parquet cache when it exists,
# otherwise downloaded from the release URL and cached)
df = load_binder_data()
Loading data from: https://github.com/jupyterhub/binder-data/releases/download/latest/launches.parquet
✓ Loaded 35,753,858 records
Date range: 2018-11-03 00:00:00+00:00 to 2025-10-14 20:09:00+00:00
✓ Saved to _build/data/launches.parquet
Launches Over Time by Source¶
Weekly trends of Binder launches grouped by the BinderHub that launched them.
def plot_launches_by_source_over_time(df):
    """
    Create interactive stacked area chart showing number of launches over time by week for each source.

    The window ends at the close of the Sunday one week before the most recent
    Sunday on or before the latest timestamp, and spans the 450 days
    (about 15 months) before that point.
    Filters out origins with median monthly launches less than 50, then draws
    only the 8 origins with the most launches in the window.
    A dotted vertical line marks 365 days before the module-level ``today``.

    Args:
        df: DataFrame with a datetime "timestamp" column and an "origin" column

    Returns:
        None. The figure is displayed with ``fig.show()``. If ``df`` is empty,
        lacks a required column, or has no launches left after filtering, a
        message is printed and nothing is drawn.
    """
    if df.empty or "timestamp" not in df.columns or "origin" not in df.columns:
        print("Missing required columns for time series analysis")
        return

    # Find the end of the last complete week (Sunday)
    latest_date = df["timestamp"].max()
    days_since_sunday = (latest_date.weekday() + 1) % 7  # Monday=0, Sunday=6
    # Normalize to midnight so the window boundary does not inherit the
    # time-of-day of the latest record; otherwise the final Sunday is cut off
    # part-way through and the last weekly bin is undercounted.
    last_sunday = latest_date.normalize() - timedelta(days=days_since_sunday + 7)
    window_end = last_sunday + timedelta(days=1)  # exclusive: start of the following Monday

    # Filter for last 15 months
    window_start = window_end - timedelta(days=450)
    in_window = (df["timestamp"] >= window_start) & (df["timestamp"] < window_end)
    df_filtered = df[in_window]

    # Calculate median monthly launches per origin over the window. Grouping on
    # the derived period Series avoids copying the whole frame for one column.
    month_year = df_filtered["timestamp"].dt.to_period("M")
    monthly_counts = df_filtered.groupby(["origin", month_year]).size()
    median_monthly = monthly_counts.groupby(level="origin").median()

    # Keep only origins with median monthly launches >= 50
    active_origins = median_monthly[median_monthly >= 50].index
    df_filtered = df_filtered[df_filtered["origin"].isin(active_origins)]

    if df_filtered.empty:
        # Nothing to resample or plot (e.g. the data covers less than the
        # excluded trailing period, or no origin passes the filter).
        print("No launches found in the selected time window")
        return

    # Resample by week and count launches by origin
    df_weekly = (
        df_filtered.set_index("timestamp").groupby("origin").resample("W", include_groups=False).size().reset_index()
    )
    df_weekly.columns = ["origin", "week", "launches"]

    # Get top sources for better visualization
    top_sources = df_filtered["origin"].value_counts().head(8).index
    df_weekly_top = df_weekly[df_weekly["origin"].isin(top_sources)]

    # Calculate "one year ago" from today (module-level run time of the notebook)
    one_year_ago = today - timedelta(days=365)

    fig = px.area(
        df_weekly_top,
        x="week",
        y="launches",
        color="origin",
        title="MyBinder Launches Over Time by Source (Last 15 Months)",
        labels={"week": "Date", "launches": "Number of Launches", "origin": "Source"},
    )

    # Add vertical line for "one year ago"
    fig.add_vline(
        x=one_year_ago.timestamp() * 1000,  # Convert to milliseconds
        line_dash="dot",
        line_color="gray",
        line_width=2,
        annotation_text="One year ago",
        annotation_position="top",
    )

    fig.update_layout(
        height=600,
        hovermode="x unified",
        legend=dict(orientation="h", yanchor="top", y=-0.1, xanchor="center", x=0.5),
    )
    fig.show()
# Render the weekly launches-by-source chart for the loaded data
plot_launches_by_source_over_time(df)
Loading...
Top 25 Repositories (Last 90 Days)¶
The most popular repositories launched on MyBinder in the last 90 days.
def plot_top_repositories_last_90_days(df, top_n=25):
    """
    Create interactive horizontal bar chart of launches in the last 90 days for the top repositories.

    Excludes the latest week of data which may be incomplete: the window is the
    90 days that end 7 days before the latest timestamp.
    Specs of the form ``org/repo/ref`` are collapsed to ``org/repo`` so that
    launches of different refs of one repository are counted together; any
    other spec shape is counted as-is.

    Args:
        df: DataFrame with a datetime "timestamp" column and a "spec" column
        top_n: Number of repositories to show (default 25, matching the chart title)

    Returns:
        None. The figure is displayed with ``fig.show()``. If ``df`` is empty
        or lacks a required column, a message is printed and nothing is drawn.
    """
    if df.empty or "timestamp" not in df.columns or "spec" not in df.columns:
        print("Missing required columns for repository analysis")
        return

    # Exclude the latest week of data (may be incomplete)
    latest_week_start = df["timestamp"].max() - timedelta(days=7)

    # Filter to last 90 days (excluding latest week)
    cutoff_date = latest_week_start - timedelta(days=90)
    in_window = (df["timestamp"] >= cutoff_date) & (df["timestamp"] < latest_week_start)
    # Work on the "spec" Series directly instead of assigning a new column to
    # a slice of df, which triggers pandas' SettingWithCopy warning.
    recent_specs = df.loc[in_window, "spec"]

    # Pull only the repo so we aren't separating by the ref
    def remove_ref(s):
        # Missing / non-string specs have nothing to split; value_counts
        # drops missing values afterwards.
        if not isinstance(s, str):
            return s
        parts = s.split("/")
        # Assume it is org/repo/ref
        if len(parts) == 3:
            return "/".join(parts[:2])
        # Assume it's something else
        return s

    # Count launches by repository and keep the busiest ones
    repo_counts = recent_specs.map(remove_ref).value_counts().head(top_n)

    # Create DataFrame for plotting
    plot_df = pd.DataFrame({
        "repository": repo_counts.index,
        "launches": repo_counts.values,
    })

    fig = px.bar(
        plot_df,
        x="launches",
        y="repository",
        orientation="h",
        title=f"Top {top_n} Repositories by Launches (Last 90 Days, excluding latest week)",
        labels={"launches": "Number of Launches", "repository": "Repository"},
        text="launches",
    )
    fig.update_layout(
        height=1600,
        yaxis={"categoryorder": "total ascending"},
    )
    fig.update_traces(textposition="outside")
    fig.show()
# Render the top-repositories bar chart for the loaded data
plot_top_repositories_last_90_days(df)
Loading...
Provider Distribution (Last 90 Days)¶
Distribution of launches by git provider (GitHub, GitLab, etc.) in the last 90 days.
def plot_provider_distribution_last_90_days(df):
    """
    Draw a bar chart of launch counts per provider over a 90-day window.

    The window stops 7 days before the newest timestamp, because the latest
    week of data may be incomplete, and reaches back 90 days from there.
    Prints a message and returns without plotting when ``df`` is empty or
    lacks the "timestamp" or "provider" column.
    """
    columns = df.columns
    if df.empty or "timestamp" not in columns or "provider" not in columns:
        print("Missing required columns for provider analysis")
        return

    # Window: [end - 90 days, end), where end is 7 days before the newest record
    stamps = df["timestamp"]
    window_end = stamps.max() - timedelta(days=7)
    window_start = window_end - timedelta(days=90)
    recent = df[(stamps >= window_start) & (stamps < window_end)]

    # Launch totals per provider, reshaped for plotly
    counts = recent["provider"].value_counts()
    plot_df = pd.DataFrame({"provider": counts.index, "launches": counts.values})

    fig = px.bar(
        plot_df,
        x="provider",
        y="launches",
        title="MyBinder Launches by Provider (Last 90 Days, excluding latest week)",
        labels={"launches": "Number of Launches", "provider": "Provider"},
        text="launches",
    )
    # Order bars from most to fewest launches and print the count above each bar
    fig.update_layout(height=600, xaxis={"categoryorder": "total descending"})
    fig.update_traces(textposition="outside")
    fig.show()
# Render the launches-by-provider bar chart for the loaded data
plot_provider_distribution_last_90_days(df)
Loading...
def create_monthly_launches_table(df):
    """
    Tabulate launches per calendar month and provider.

    Covers the 450 days (about 15 months) up to the newest timestamp in ``df``.
    The result has one row per month (newest first, labelled as strings),
    one integer column per provider, and a "Total" column with the row sum.
    Prints a message and returns None when ``df`` is empty or lacks the
    "timestamp" or "provider" column.
    """
    if df.empty or not {"timestamp", "provider"}.issubset(df.columns):
        print("Missing required columns for monthly analysis")
        return

    # Keep everything from 450 days before the newest record onwards
    window_start = df["timestamp"].max() - timedelta(days=450)
    recent = df[df["timestamp"] >= window_start]

    # Tag each launch with its calendar month (assign returns a new frame)
    recent = recent.assign(month_year=recent["timestamp"].dt.to_period("M"))

    # Count per (month, provider), then spread providers across columns;
    # month/provider pairs with no launches become 0
    counts = recent.groupby(["month_year", "provider"]).size()
    table = counts.unstack("provider").fillna(0).astype(int)

    # Row totals across providers
    table["Total"] = table.sum(axis=1)

    # Newest month first, with plain-string labels for display
    table = table.sort_index(ascending=False)
    table.index = table.index.astype(str)
    return table
# Build the month-by-provider table and display it with a caption
monthly_table = create_monthly_launches_table(df)
monthly_table.style.set_caption("Monthly launches by repository provider.")
Loading...
Monthly Launches by BinderHub Provider (Last 15 Months)¶
Monthly breakdown of launches by the BinderHub (origin) that served them, with totals, for the last 15 months.
def create_monthly_launches_table_by_origin(df):
    """
    Tabulate launches per calendar month and BinderHub origin.

    Covers the 450 days (about 15 months) up to the newest timestamp in ``df``.
    Origins whose median monthly launch count is below 50 are left out.
    The result has one row per month (newest first, labelled as strings),
    one integer column per remaining origin, and a "Total" column holding the
    sum of those remaining origins.
    Prints a message and returns None when ``df`` is empty or lacks the
    "timestamp" or "origin" column.
    """
    if df.empty or not {"timestamp", "origin"}.issubset(df.columns):
        print("Missing required columns for monthly origin analysis")
        return

    # Keep everything from 450 days before the newest record onwards
    window_start = df["timestamp"].max() - timedelta(days=450)
    recent = df[df["timestamp"] >= window_start]

    # Tag each launch with its calendar month (assign returns a new frame)
    recent = recent.assign(month_year=recent["timestamp"].dt.to_period("M"))

    # Long-form counts: one row per (month, origin)
    counts = recent.groupby(["month_year", "origin"]).size().reset_index(name="launches")

    # Drop origins whose median monthly count is under 50
    typical = counts.groupby("origin")["launches"].median()
    busy_origins = typical.loc[typical.ge(50)].index
    counts = counts[counts["origin"].isin(busy_origins)]

    # Spread origins across columns; months with no launches for an origin become 0
    table = counts.pivot(index="month_year", columns="origin", values="launches")
    table = table.fillna(0).astype(int)

    # Row totals across the remaining origins
    table["Total"] = table.sum(axis=1)

    # Newest month first, with plain-string labels for display
    table = table.sort_index(ascending=False)
    table.index = table.index.astype(str)
    return table
# Build the month-by-origin table and display it with a caption
monthly_origin_table = create_monthly_launches_table_by_origin(df)
monthly_origin_table.style.set_caption("Monthly launches by BinderHub provider.")
Loading...