This notebook analyzes MyBinder launch data from the automated releases at the following URL: https://github.com/jupyterhub/binder-data/releases/download/latest/launches.parquet
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def load_binder_data(
    url="https://github.com/jupyterhub/binder-data/releases/download/latest/launches.parquet",
):
    """
    Load MyBinder analytics data from GitHub release.

    Args:
        url: URL (or path) of the parquet file to read with ``pd.read_parquet``.

    Returns:
        DataFrame with binder launch data; its ``timestamp`` column is
        converted with ``pd.to_datetime``.

    Raises:
        Exception: any error raised while reading or converting the data is
            printed and then re-raised unchanged.
    """
    print(f"Loading data from: {url}")
    try:
        df = pd.read_parquet(url)
        print(f"✓ Loaded {len(df):,} records")
        # Parse timestamps *before* reporting the range so min/max are
        # chronological even if the column arrives as strings.
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
        return df
    except Exception as e:
        print(f"✗ Failed to load data: {e}")
        raise  # Re-raise the exception to fail the notebook
# Load the launch data once; the analysis calls below all receive this `df`.
df = load_binder_data()
Loading data from: https://github.com/jupyterhub/binder-data/releases/download/latest/launches.parquet
✓ Loaded 35,316,099 records
Date range: 2018-11-03 00:00:00+00:00 to 2025-08-20 19:06:00+00:00
Launches Over Time by Source
Weekly trends of Binder launches grouped by the BinderHub that launched them.
def plot_launches_by_source_over_time(df):
    """
    Show an interactive stacked area chart of weekly launch counts per origin.

    Rows from the 7 days before the newest timestamp are dropped because that
    data may be incomplete. Only the 8 origins with the most launches in the
    remaining data are drawn.

    Args:
        df: DataFrame with ``timestamp`` and ``origin`` columns.

    Returns:
        None. The chart is displayed with ``fig.show()``; if ``df`` is empty or
        a required column is missing, a message is printed instead.
    """
    if df.empty or not {"timestamp", "origin"}.issubset(df.columns):
        print("Missing required columns for time series analysis")
        return None

    # Everything at or after this instant belongs to the (possibly partial) latest week.
    newest_allowed = df["timestamp"].max() - timedelta(days=7)
    complete = df.loc[df["timestamp"] < newest_allowed]

    # Weekly launch counts, computed separately for each origin.
    per_origin = complete.set_index("timestamp").groupby("origin")
    weekly = per_origin.resample("W", include_groups=False).size().reset_index()
    weekly.columns = ["origin", "week", "launches"]

    # Keep the chart readable by limiting it to the busiest origins.
    busiest = complete["origin"].value_counts().head(8).index
    weekly_busiest = weekly.loc[weekly["origin"].isin(busiest)]

    axis_labels = {"week": "Date", "launches": "Number of Launches", "origin": "Source"}
    fig = px.area(
        weekly_busiest,
        x="week",
        y="launches",
        color="origin",
        title="MyBinder Launches Over Time by Source (Weekly - Stacked)",
        labels=axis_labels,
    )
    fig.update_layout(
        height=600,
        hovermode="x unified",
        legend={
            "orientation": "h",
            "yanchor": "top",
            "y": -0.1,
            "xanchor": "center",
            "x": 0.5,
        },
    )
    fig.show()
# Render the weekly launches-by-origin chart for the loaded data.
plot_launches_by_source_over_time(df)
Loading...
Top 25 Repositories (Last 90 Days)
The most popular repositories launched on MyBinder in the last 90 days.
def plot_top_repositories_last_90_days(df, top_n=25):
    """
    Create interactive horizontal bar chart of launches in the last 90 days for the top repositories.
    Excludes the latest week of data which may be incomplete.

    Specs of the form ``org/repo/ref`` are collapsed to ``org/repo`` so that
    launches of different refs of one repository are counted together; specs
    with any other number of ``/``-separated parts are counted as-is.

    Args:
        df: DataFrame with ``timestamp`` and ``spec`` columns.
        top_n: Number of repositories to show (default 25, matching the chart title).

    Returns:
        None. The chart is displayed with ``fig.show()``; if ``df`` is empty or
        a required column is missing, a message is printed instead.
    """
    if df.empty or "timestamp" not in df.columns or "spec" not in df.columns:
        print("Missing required columns for repository analysis")
        return

    def remove_ref(spec):
        """Return ``org/repo`` for an ``org/repo/ref`` spec; anything else unchanged."""
        # Null / non-string specs cannot be split; pass them through so
        # value_counts() can ignore missing values instead of this raising.
        if not isinstance(spec, str):
            return spec
        parts = spec.split("/")
        # Assume exactly three parts means org/repo/ref
        if len(parts) == 3:
            return "/".join(parts[:2])
        return spec

    # Exclude the latest week of data (may be incomplete)
    latest_week_start = df["timestamp"].max() - timedelta(days=7)
    # Filter to last 90 days (excluding latest week)
    cutoff_date = latest_week_start - timedelta(days=90)
    in_window = (df["timestamp"] >= cutoff_date) & (df["timestamp"] < latest_week_start)

    # Build the repo names as a standalone Series rather than assigning a new
    # column onto a filtered slice of `df` (which triggers SettingWithCopyWarning).
    repos = df.loc[in_window, "spec"].map(remove_ref)

    # Count launches by repository and keep only the requested number
    repo_counts = repos.value_counts().head(top_n)

    plot_df = pd.DataFrame({
        "repository": repo_counts.index,
        "launches": repo_counts.values,
    })

    fig = px.bar(
        plot_df,
        x="launches",
        y="repository",
        orientation="h",
        title=f"Top {top_n} Repositories by Launches (Last 90 Days, excluding latest week)",
        labels={"launches": "Number of Launches", "repository": "Repository"},
        text="launches",
    )
    fig.update_layout(
        # Roughly 32px per bar (the original used 1600px for 50 bars), with a floor
        # so small charts stay legible.
        height=max(600, 32 * top_n),
        yaxis={"categoryorder": "total ascending"},
    )
    fig.update_traces(textposition="outside")
    fig.show()
# Render the top-repositories bar chart for the loaded data.
plot_top_repositories_last_90_days(df)
Loading...
Provider Distribution (Last 90 Days)
Distribution of launches by git provider (GitHub, GitLab, etc.) in the last 90 days.
def plot_provider_distribution_last_90_days(df):
    """
    Show a bar chart of launch counts per provider over a 90-day window.

    The window ends 7 days before the newest timestamp (that final week may be
    incomplete) and starts 90 days before that point.

    Args:
        df: DataFrame with ``timestamp`` and ``provider`` columns.

    Returns:
        None. The chart is displayed with ``fig.show()``; if ``df`` is empty or
        a required column is missing, a message is printed instead.
    """
    if df.empty or not {"timestamp", "provider"}.issubset(df.columns):
        print("Missing required columns for provider analysis")
        return None

    # Half-open window [window_start, window_end): the latest week is left out.
    window_end = df["timestamp"].max() - timedelta(days=7)
    window_start = window_end - timedelta(days=90)
    in_window = df["timestamp"].between(window_start, window_end, inclusive="left")

    # One row per provider with its launch count, ready for plotting.
    counts = df.loc[in_window, "provider"].value_counts()
    chart_data = counts.rename_axis("provider").reset_index(name="launches")

    fig = px.bar(
        chart_data,
        x="provider",
        y="launches",
        title="MyBinder Launches by Provider (Last 90 Days, excluding latest week)",
        labels={"launches": "Number of Launches", "provider": "Provider"},
        text="launches",
    )
    fig.update_layout(height=600, xaxis={"categoryorder": "total descending"})
    fig.update_traces(textposition="outside")
    fig.show()
# Render the launches-by-provider bar chart for the loaded data.
plot_provider_distribution_last_90_days(df)
Loading...
def create_monthly_launches_table(df):
    """
    Build a month-by-provider table of launch counts with a ``Total`` column.

    The window ends 7 days before the newest timestamp (that final week may be
    incomplete) and starts 365 days before that point.

    Args:
        df: DataFrame with ``timestamp`` and ``provider`` columns.

    Returns:
        DataFrame indexed by month string (most recent first) with one integer
        column per provider plus ``Total``. Returns None, after printing a
        message, if ``df`` is empty or a required column is missing.
    """
    if df.empty or not {"timestamp", "provider"}.issubset(df.columns):
        print("Missing required columns for monthly analysis")
        return None

    # Half-open window [window_start, window_end): the latest week is left out.
    window_end = df["timestamp"].max() - timedelta(days=7)
    window_start = window_end - timedelta(days=365)
    in_window = df["timestamp"].between(window_start, window_end, inclusive="left")

    # assign() returns a new frame, so the caller's DataFrame is never modified.
    recent = df.loc[in_window].assign(
        month_year=lambda frame: frame["timestamp"].dt.to_period("M")
    )

    # Launches per (month, provider), spread out so each provider is a column;
    # combinations with no launches become 0.
    counts = recent.groupby(["month_year", "provider"]).size()
    table = counts.unstack("provider", fill_value=0).astype(int)

    table["Total"] = table.sum(axis=1)

    # Most recent month first; months are shown as plain strings.
    table = table.sort_index(ascending=False)
    table.index = table.index.astype(str)
    return table
# Build the monthly launches-by-provider table and render it with a caption.
# NOTE(review): create_monthly_launches_table returns None when `df` is empty or
# lacks the required columns, in which case `.style` below raises AttributeError.
monthly_table = create_monthly_launches_table(df)
monthly_table.style.set_caption("Monthly launches by repository provider.")
Loading...
Monthly Launches by Provider (Last 12 Months)
Monthly breakdown of launches by provider with totals for the last 12 months.
def create_monthly_launches_table_by_origin(df):
    """
    Build a month-by-origin table of launch counts with a ``Total`` column.

    The window ends 7 days before the newest timestamp (that final week may be
    incomplete) and starts 365 days before that point.

    Args:
        df: DataFrame with ``timestamp`` and ``origin`` columns.

    Returns:
        DataFrame indexed by month string (most recent first) with one integer
        column per origin plus ``Total``. Returns None, after printing a
        message, if ``df`` is empty or a required column is missing.
    """
    if df.empty or not {"timestamp", "origin"}.issubset(df.columns):
        print("Missing required columns for monthly origin analysis")
        return None

    # Half-open window [window_start, window_end): the latest week is left out.
    window_end = df["timestamp"].max() - timedelta(days=7)
    window_start = window_end - timedelta(days=365)
    in_window = df["timestamp"].between(window_start, window_end, inclusive="left")

    # assign() returns a new frame, so the caller's DataFrame is never modified.
    recent = df.loc[in_window].assign(
        month_year=lambda frame: frame["timestamp"].dt.to_period("M")
    )

    # Launches per (month, origin), spread out so each origin is a column;
    # combinations with no launches become 0.
    counts = recent.groupby(["month_year", "origin"]).size()
    table = counts.unstack("origin", fill_value=0).astype(int)

    table["Total"] = table.sum(axis=1)

    # Most recent month first; months are shown as plain strings.
    table = table.sort_index(ascending=False)
    table.index = table.index.astype(str)
    return table
# Build the monthly launches-by-origin table and render it with a caption.
# NOTE(review): create_monthly_launches_table_by_origin returns None when `df` is
# empty or lacks the required columns, in which case `.style` below raises AttributeError.
monthly_origin_table = create_monthly_launches_table_by_origin(df)
monthly_origin_table.style.set_caption("Monthly launches by BinderHub provider.")
Loading...