Awesome De Novo Peptide Sequencing
A comprehensive, interactive map of the field — algorithms, post-processors, downstream applications, and adjacent tools, deep-learning and classical alike.
import { aq, op } from "@uwdata/arquero"
// Row-wise views of the query results. `transpose` (presumably Quarto's OJS
// helper) is applied to each table so later cells can iterate over plain row
// objects.
//
// pubs_t additionally coerces `year` to a number and `date` to a Date (or null).
// A missing year must become NaN, not 0: unary plus turns null, "" and blank
// strings into 0, which would slip through the Number.isFinite() guards used
// downstream and make `first_year` read 0.
pubs_t = transpose(pubs).map(r => ({
  ...r,
  year: r.year == null || String(r.year).trim() === "" ? NaN : +r.year,
  date: r.date ? new Date(r.date) : null
}))
top_authors_t = transpose(top_authors)
geo_t = transpose(geo)
institutions_t = transpose(institutions)
pub_authorship_t = transpose(pub_authorship)
coauth_edges_t = transpose(coauth_edges)
author_affs_t = transpose(author_affs)
// algorithms_t: `first_pub` becomes a Date (or null); `is_dl` becomes a boolean,
// with null/undefined kept as null so "unknown" stays distinct from "classical".
algorithms_t = transpose(algorithms).map(r => ({
  ...r,
  first_pub: r.first_pub ? new Date(r.first_pub) : null,
  is_dl: r.is_dl == null ? null : !!r.is_dl // SQLite INTEGER 0/1 → JS boolean
}))
// Version-aware view of algorithms: any algorithm whose joined publications
// carry a `version` tag (currently Casanovo v1 / v2 / v5) is expanded into one
// row per version, each with its own first_pub date and a label like
// "Casanovo v2". Drives the architectures timeline so successive releases of
// the same method appear as distinct dots instead of collapsing onto the
// earliest one. All other downstream cells (counters, bipartite network,
// browse table) keep using `algorithms_t` and remain one-row-per-algorithm.
algorithms_versioned_t = {
const pubs_for = new Map()
for (const p of pubs_t) {
for (const m of (p.models ?? "").split(",").map(s => s.trim()).filter(Boolean)) {
if (!pubs_for.has(m)) pubs_for.set(m, [])
pubs_for.get(m).push(p)
}
}
const out = []
for (const r of algorithms_t) {
const its_pubs = pubs_for.get(r.model) ?? []
const versions = Array.from(new Set(its_pubs.map(p => p.version).filter(Boolean))).sort()
if (versions.length === 0) { out.push({ ...r, base_model: r.model }); continue }
for (const v of versions) {
const dates = its_pubs.filter(p => p.version === v).map(p => p.date).filter(Boolean)
const earliest = dates.length ? new Date(Math.min(...dates.map(d => +d))) : r.first_pub
out.push({ ...r, model: `${r.model} ${v}`, version: v, base_model: r.model, first_pub: earliest })
}
const unversioned = its_pubs.filter(p => !p.version)
if (unversioned.length) {
const dates = unversioned.map(p => p.date).filter(Boolean)
const earliest = dates.length ? new Date(Math.min(...dates.map(d => +d))) : r.first_pub
out.push({ ...r, base_model: r.model, first_pub: earliest })
}
}
return out
}
// Row-wise views of the remaining tables.
venues_t = transpose(venues)
author_details_t = transpose(author_details)
citations_t = transpose(citations)
journal_impact_t = transpose(journal_impact)
// Index of journal_impact rows keyed by journal name, for fast joins in OJS
// cells. Each value is the full row (presumably including the 2-year
// citedness figure — confirm against the journal_impact query).
journal_impact_by_name = new Map(
  Array.from(journal_impact_t, row => [row.journal, row])
)
// Derived counters — everything flows from the data, no hardcoded numbers.
n_papers = pubs_t.length
n_models = algorithms_t.length
// Distinct author names across all papers. The list is split on "," and each
// name trimmed — the same convention used for the `models` column — so the
// count does not depend on the separator being exactly ", ".
n_authors = new Set(
  pubs_t.flatMap(p => (p.authors ?? "").split(",").map(s => s.trim()).filter(Boolean))
).size
n_countries = geo_t.length
// NOTE: Math.min / Math.max over an empty list give ±Infinity, so these read
// Infinity / -Infinity if no paper has a finite year.
years_with_pubs = pubs_t.map(p => p.year).filter(y => Number.isFinite(y))
first_year = Math.min(...years_with_pubs)
last_year = Math.max(...years_with_pubs)
// First year a paper using deep learning appears in the catalog — anchors the
// "wave" prose so the DL inflection point reads from data, not a constant.
dl_algo_names = new Set(algorithms_t.filter(a => a.is_dl === true).map(a => a.model))
first_dl_year = Math.min(...pubs_t
  .filter(p => Number.isFinite(p.year) && (p.models ?? "").split(",").map(s => s.trim()).some(m => dl_algo_names.has(m)))
  .map(p => p.year)
)
n_preprints = pubs_t.filter(p => p.type === "preprint").length
n_peer_reviewed = pubs_t.filter(p => p.type === "peer-reviewed").length
families = Array.from(new Set(algorithms_t.map(a => a.family).filter(Boolean)))
// Classification breakdowns (kind / DL / acquisition).
// Algorithms whose is_dl is null are counted in neither n_dl nor n_non_dl.
n_dl = algorithms_t.filter(a => a.is_dl === true).length
n_non_dl = algorithms_t.filter(a => a.is_dl === false).length
kinds_present = Array.from(new Set(algorithms_t.map(a => a.kind).filter(Boolean))).sort()
acq_modes_present = Array.from(new Set(algorithms_t.map(a => a.acquisition).filter(Boolean))).sort()
kind_counts = kinds_present.map(k => ({ kind: k, n: algorithms_t.filter(a => a.kind === k).length }))
// Number of algorithms per acquisition mode, in the sorted order of
// acq_modes_present. The hero-stat grid that follows shows the four headline
// counters (papers, methods, authors, countries).
acq_counts = acq_modes_present.map(m => ({ acquisition: m, n: algorithms_t.filter(a => a.acquisition === m).length }))html`<div class="hero-grid">
<div class="hero-stat"><div class="hero-num">${n_papers}</div><div class="hero-lbl">papers</div></div>
<div class="hero-stat"><div class="hero-num">${n_models}</div><div class="hero-lbl">methods</div></div>
<div class="hero-stat"><div class="hero-num">${n_authors}</div><div class="hero-lbl">authors</div></div>
<div class="hero-stat"><div class="hero-num">${n_countries}</div><div class="hero-lbl">countries</div></div>
</div>`{
// Breakdown panel: three columns of mini bar rows (by kind, by approach, by
// acquisition). Every bar width is the count as a percentage of n_models.
// Display labels for the `kind` values; a kind missing from this table falls
// back to its raw value in the template below.
const kind_label = {
"algorithm": "Algorithms",
"post-processor": "Post-processors",
"downstream-application": "Downstream apps",
"adjacent": "Adjacent",
"review": "Reviews / surveys",
"benchmark": "Benchmarks",
"meta": "Meta"
}
// NOTE(review): the widths divide by n_models, so they are NaN% when the
// algorithms table is empty.
return html`<div class="breakdown-grid">
<div class="breakdown-cell">
<div class="breakdown-title">By kind</div>
${kind_counts.map(k => html`<div class="breakdown-row">
<span class="bk-num">${k.n}</span>
<span class="bk-bar"><span class="bk-fill" style="width:${100 * k.n / n_models}%"></span></span>
<span class="bk-lbl">${kind_label[k.kind] ?? k.kind}</span>
</div>`)}
</div>
<div class="breakdown-cell">
<div class="breakdown-title">By approach</div>
<div class="breakdown-row">
<span class="bk-num">${n_dl}</span>
<span class="bk-bar"><span class="bk-fill" style="width:${100 * n_dl / n_models}%; background:#1f6feb"></span></span>
<span class="bk-lbl">Deep learning</span>
</div>
<div class="breakdown-row">
<span class="bk-num">${n_non_dl}</span>
<span class="bk-bar"><span class="bk-fill" style="width:${100 * n_non_dl / n_models}%; background:#6f42c1"></span></span>
<span class="bk-lbl">Classical</span>
</div>
</div>
<div class="breakdown-cell">
<div class="breakdown-title">By acquisition</div>
${acq_counts.map(m => html`<div class="breakdown-row">
<span class="bk-num">${m.n}</span>
<span class="bk-bar"><span class="bk-fill" style="width:${100 * m.n / n_models}%; background:#1a7f37"></span></span>
<span class="bk-lbl">${m.acquisition}</span>
</div>`)}
</div>
</div>`
}md`Since **${first_year}**, **${n_papers}** papers have introduced **${n_models}** methods for *de novo* peptide sequencing, written by **${n_authors}** authors across **${n_countries}** countries. Of those papers, **${n_preprints}** are preprints and **${n_peer_reviewed}** are peer-reviewed — a snapshot of a field where the conversation moves faster than the journals.`Scope. A comprehensive map of de novo peptide sequencing — core algorithms, post-processors (re-rankers / FDR / refinement), downstream applications (immunopeptidomics, metaproteomics, cyclopeptides), adjacent tools (database-search hybrids, glycopeptide pipelines), reviews / surveys, and benchmarks. Both deep-learning and classical methods are tracked; the filters below let you slice by approach, acquisition mode (DDA / DIA), and paper kind. Want a paper added? See Contributing.
🎚️ Filters — Kind · Approach · Acquisition. Apply across the whole page; pinned to the top while you scroll.
pubs_matches_filter = p => {
if (p.kind && !kind_filter.includes(p.kind)) return false
if (dl_filter === "DL only" && !(p.is_dl === 1 || p.is_dl === true)) return false
if (dl_filter === "Classical only" && !(p.is_dl === 0 || p.is_dl === false)) return false
if (p.acquisition && !acq_filter.includes(p.acquisition)) return false
return true
}
pubs_filtered = pubs_t.filter(pubs_matches_filter)
n_papers_filtered = pubs_filtered.lengthThe wave
md`The earliest paper tracked here appeared in **${first_year}**; the first deep-learning method shows up in **${first_dl_year}**. Activity has accelerated sharply since — **${n_peer_reviewed}** papers have made it through peer review, alongside **${n_preprints}** preprints still in the publication pipeline.`Plot.plot({
// Papers-per-year bar chart over the globally filtered papers. Note that the
// prose above quotes the unfiltered n_peer_reviewed / n_preprints totals,
// whereas the bars (and the count in the y label) follow pubs_filtered.
marginLeft: 50,
width: 1100,
height: 360,
x: { label: "Year", tickFormat: "d", interval: 1 },
y: { label: `Papers (${n_papers_filtered} shown)`, grid: true },
// Fixed legend order for the publication types.
// NOTE(review): a `type` value outside this list has no entry in the domain — confirm none exist.
color: { legend: true, scheme: "blues", domain: ["preprint", "peer-reviewed", "ML conference", "thesis", "commentary"] },
marks: [
// One bar per year, counting papers and colored by `type`; rows without a
// finite year are dropped first.
Plot.barY(
pubs_filtered.filter(p => Number.isFinite(p.year)),
Plot.groupX(
{ y: "count" },
{ x: "year", fill: "type", tip: true }
)
),
// Baseline at zero.
Plot.ruleY([0])
]
})The architectures
De novo sequencing has cycled through several methodological families — first hand-engineered dynamic programming and learning-to-rank, then a long stretch of CNN+RNN models, then transformers, GNNs, NAR variants, and most recently diffusion. Use the filters to focus on one slice of the field; hover a dot to read the method’s signature contribution.
innovations_timeline = {
// Swim-lane timeline: one horizontal band per architecture family and one dot
// per row of algorithms_versioned_t, placed at its first_pub date. Rows are
// restricted by family_filter / kind_filter / dl_filter / acq_filter, and only
// families with at least one surviving row get a band. Returns the chart
// wrapped in a scrollable container with a fullscreen button.
// Notebook-style band order: oldest paradigm at the bottom, newest at the top.
// Classical (non-DL) bands sit at the bottom: heuristic → graph/DP → HMM →
// decision tree → random forest → learning-to-rank → CNN+RNN → ... → diffusion.
const band_order = [
"Heuristic", "Graph / DP", "HMM", "Decision tree",
"Random Forest", "Learning-to-rank",
"CNN + RNN", "Transformer (AR)", "GNN", "CNN", "Transformer (NAR)", "Diffusion", "Flow"
]
// Fill color per family, used for the band tint, the dots and the labels.
const band_color = {
"Heuristic": "#8a96a0",
"Graph / DP": "#6c757d",
"HMM": "#4d6a8c",
"Decision tree": "#7b6f43",
"Random Forest": "#a5673f",
"Learning-to-rank": "#5f6b7a",
"CNN + RNN": "#4C72B0",
"Transformer (AR)": "#DD8452",
"GNN": "#937860",
"CNN": "#8172B3",
"Transformer (NAR)": "#55A868",
"Diffusion": "#C44E52",
"Flow": "#937DC2"
}
// Tiers chosen to spread labels both above and below the band center.
// 25 interleaved tier positions (the previous 15 ran out for the 2024-2026
// Transformer (AR) burst, dropping ~6 models onto the center-line fallback).
// Each value is a fraction of the band's height, measured from its bottom edge.
const Y_TIERS = [
0.5, 0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.6, 0.4,
0.85, 0.15, 0.95, 0.05, 0.75, 0.25, 0.65, 0.35, 0.55, 0.45,
0.88, 0.12, 0.78, 0.22, 0.58, 0.42
]
// MIN_GAP_DAYS is computed dynamically below from the actual x-range so it self-tunes
// when more years of data come in (e.g. older classical methods extending back to 1997).
// Per-family band heights — busy families get more vertical room (matches plots.ipynb).
// Graph / DP now holds seven classical algorithms (Sherenga, PEAKS, PepNovo, MSNovo,
// pNovo, pNovo+, CycloNovo) so it needs a taller swimlane.
const band_heights = {
"Heuristic": 1.0,
"Graph / DP": 2.5,
"HMM": 1.0,
"Decision tree": 1.0,
"Random Forest": 1.2,
"Learning-to-rank": 1.2,
"CNN + RNN": 2.0,
"Transformer (AR)": 8.0,
"GNN": 1.2,
"CNN": 1.2,
"Transformer (NAR)": 2.5,
"Diffusion": 1.2,
"Flow": 1.2
}
// Apply every filter (family, kind, approach, acquisition) up front.
// Rows lacking a first_pub or a family are always excluded. Unlike the
// acquisition check, the kind check is strict: a row with no kind fails it.
const matches_filters = a => {
if (!a.first_pub || !a.family) return false
if (!family_filter.includes(a.family)) return false
if (!kind_filter.includes(a.kind)) return false
if (dl_filter === "DL only" && a.is_dl !== true) return false
if (dl_filter === "Classical only" && a.is_dl !== false) return false
if (a.acquisition && !acq_filter.includes(a.acquisition)) return false
return true
}
// Only include bands that have at least one surviving model.
const present = new Set(algorithms_versioned_t.filter(matches_filters).map(a => a.family))
const visible_bands = band_order.filter(f => present.has(f))
// Stack bands bottom-to-top, each at its own height.
let y_cursor = 0
const bands = visible_bands.map(family => {
const h = band_heights[family] ?? 1.2
const band = { family, y0: y_cursor, y1: y_cursor + h, center: y_cursor + h / 2, height: h }
y_cursor += h
return band
})
const total_height = y_cursor
const band_index = new Map(bands.map(b => [b.family, b]))
// X-range computed first so MIN_GAP_DAYS can self-tune to the actual scale.
// It is taken from every row that has a first_pub, not only the rows passing
// the filters, so the time axis does not move when filters change.
const all_dates = algorithms_versioned_t.filter(a => a.first_pub).map(a => a.first_pub)
const x_min = d3.min(all_dates)
const x_max = d3.max(all_dates)
// Pad each side so labels at the temporal extremes (DeepNovo, DiffuNovo) stay inside the band.
const x_pad_min = d3.timeMonth.offset(x_min, -8)
const x_pad_max = d3.timeMonth.offset(x_max, 8)
// MIN_GAP_DAYS: how many days of horizontal space we want between two labels on
// the same tier. We compute it from the actual x-scale (so the spacing self-tunes
// when the timeline extends) but cap it so dense bursts (e.g. 2024-2026 Transformer
// (AR) papers) don't all fall through to the center-line fallback.
const PLOT_WIDTH_PX = 1200 // chart 1400 minus left+right margins
const LABEL_RESERVE_PX = 70 // median label width across all current entries
const MAX_GAP_DAYS = 550 // cap ~1.5 yr — beyond this dense bands burst
const total_days = (x_pad_max - x_pad_min) / 86400000
const px_per_day = PLOT_WIDTH_PX / total_days
const MIN_GAP_DAYS = Math.min(MAX_GAP_DAYS, LABEL_RESERVE_PX / px_per_day)
// Rows to draw, oldest first. The band_index check is a safety net for any
// future family name that's not yet listed in band_order — without it the
// chart would crash on .y0 of undefined.
const rows = algorithms_versioned_t
.filter(a => matches_filters(a) && band_index.has(a.family))
.slice()
.sort((a, b) => a.first_pub - b.first_pub)
// Assign each model a tier so labels at similar dates don't overlap; the tier fraction
// is interpreted relative to the *band's* height so taller bands spread further apart.
// last_date_at_tier remembers, per "family|tier", the date of the last row placed
// there. Because rows arrive in date order, a row takes the first tier whose last
// occupant is at least MIN_GAP_DAYS older (or that is still empty).
const last_date_at_tier = new Map()
const items = []
for (const row of rows) {
let placed = false
for (let ti = 0; ti < Y_TIERS.length; ti++) {
const key = `${row.family}|${ti}`
const last = last_date_at_tier.get(key)
if (last === undefined || (row.first_pub - last) / 86400000 >= MIN_GAP_DAYS) {
last_date_at_tier.set(key, row.first_pub)
const band = band_index.get(row.family)
items.push({ ...row, y: band.y0 + band.height * Y_TIERS[ti], y_frac: Y_TIERS[ti] })
placed = true
break
}
}
// Fallback when every tier is still too crowded: drop the row on the band's center line.
if (!placed) {
const band = band_index.get(row.family)
items.push({ ...row, y: band.center, y_frac: 0.5 })
}
}
const chart = Plot.plot({
width: 1400,
height: Math.max(420, total_height * 70),
marginLeft: 140,
marginRight: 60,
marginBottom: 40,
x: { type: "time", label: "First publication →", domain: [x_pad_min, x_pad_max], grid: true },
y: { domain: [0, total_height], axis: null },
color: { domain: band_order, range: band_order.map(f => band_color[f]), legend: false },
marks: [
// Background band per family — extends to the padded x domain so edge labels stay inside.
Plot.rect(bands, {
x1: () => x_pad_min, x2: () => x_pad_max,
y1: "y0", y2: "y1",
fill: "family",
fillOpacity: 0.09
}),
// Thin separators between bands
Plot.ruleY(bands.flatMap(b => [b.y0, b.y1]), { stroke: "#ddd", strokeWidth: 0.5 }),
// Left-edge family label
Plot.text(bands, {
x: () => x_pad_min,
y: "center",
text: "family",
textAnchor: "end",
dx: -8,
fontSize: 12,
fontWeight: "bold",
fill: "family"
}),
// Dots
Plot.dot(items, {
x: "first_pub",
y: "y",
fill: "family",
r: 7,
stroke: "white",
strokeWidth: 1.5,
tip: true,
// Tooltip: model, family, description (blank when absent) and the ISO date.
title: d => `${d.model}\n${d.family}\n${d.description ?? ""}\n${d.first_pub.toISOString().slice(0,10)}`
}),
// Labels anchored above the dot (top half of band)
Plot.text(items.filter(d => d.y_frac >= 0.5), {
x: "first_pub",
y: "y",
text: "model",
textAnchor: "middle",
lineAnchor: "bottom",
dy: -10,
fontSize: 10,
fontWeight: "bold",
fill: "family"
}),
// Labels anchored below the dot (bottom half of band)
Plot.text(items.filter(d => d.y_frac < 0.5), {
x: "first_pub",
y: "y",
text: "model",
textAnchor: "middle",
lineAnchor: "top",
dy: 10,
fontSize: 10,
fontWeight: "bold",
fill: "family"
})
]
})
// Wrap in a horizontally-scrollable container with a fullscreen button.
return html`<div class="chart-wrap">
<button class="fs-btn" onclick="
const el = this.parentElement;
if (document.fullscreenElement) document.exitFullscreen();
else el.requestFullscreen();
">⛶ Fullscreen</button>
<div class="chart-scroll">${chart}</div>
</div>`
}Where the work happens
Authors and the institutions behind them are spread across many countries. Pan and zoom the map — zoom in past about 2× and country circles split into the individual cities behind them. Circle area scales with the chosen metric; fill colour shows the quartile rank.
where_the_work_map = {
// ===== Aggregate from the currently-filtered pubs =====
const pub_ids = new Set(pubs_filtered.map(p => p.id))
// city-level buckets
const cityKey = r => `${r.city_id}` // city_id uniquely identifies a city
const city_rows = new Map()
// country-level buckets (centroid computed from member cities, weighted by author count)
const country_rows = new Map()
for (const r of pub_authorship_t) {
if (!pub_ids.has(r.pub_id) || !r.country) continue
// country aggregate
if (!country_rows.has(r.country)) {
country_rows.set(r.country, { country: r.country, authors: new Set(), affiliations: new Set(),
lat_sum: 0, lng_sum: 0, lat_w: 0 })
}
const cr = country_rows.get(r.country)
cr.authors.add(r.author_id); cr.affiliations.add(r.affiliation_id)
if (r.lat != null && r.lng != null) {
cr.lat_sum += r.lat; cr.lng_sum += r.lng; cr.lat_w += 1
}
// city aggregate (only rows with a geocoded city)
if (r.lat == null || r.lng == null || !r.city_id) continue
const k = cityKey(r)
if (!city_rows.has(k)) {
city_rows.set(k, { city: r.city, country: r.country, lat: r.lat, lng: r.lng,
authors: new Set(), affiliations: new Set() })
}
const cc = city_rows.get(k)
cc.authors.add(r.author_id); cc.affiliations.add(r.affiliation_id)
}
const city_data = Array.from(city_rows.values()).map(r => ({
city: r.city, country: r.country, lat: r.lat, lng: r.lng,
authors: r.authors.size, affiliations: r.affiliations.size
}))
const country_data = Array.from(country_rows.values()).map(r => ({
country: r.country,
authors: r.authors.size, affiliations: r.affiliations.size,
lat: r.lat_w ? r.lat_sum / r.lat_w : 0,
lng: r.lat_w ? r.lng_sum / r.lat_w : 0
})).filter(r => r.lat_w !== 0 || (r.lat !== 0 && r.lng !== 0))
const metric = country_metric // 'authors' | 'affiliations'
// ===== Set up scales =====
const width = 1100, height = 560
const projection = d3.geoNaturalEarth1()
.fitExtent([[10, 10], [width - 10, height - 10]],
topojson.feature(world_atlas, world_atlas.objects.countries))
const path = d3.geoPath(projection)
// Size: sqrt(metric) so area is linear in metric.
const max_city = d3.max(city_data, d => d[metric]) || 1
const max_country = d3.max(country_data, d => d[metric]) || 1
const cityR = v => 2 + 12 * Math.sqrt(v / max_city)
const countryR = v => 6 + 22 * Math.sqrt(v / max_country)
// Color: quartile rank of the metric in the active layer.
const quartiles = (arr, key) => {
const sorted = arr.map(d => d[key]).filter(v => v > 0).sort(d3.ascending)
return [
d3.quantile(sorted, 0.25),
d3.quantile(sorted, 0.50),
d3.quantile(sorted, 0.75)
]
}
const palette = ["#cfe2f3", "#9ec5e8", "#5b8edb", "#1f4ec7"]
const colorFor = (v, q) => v <= q[0] ? palette[0] : v <= q[1] ? palette[1] : v <= q[2] ? palette[2] : palette[3]
const cityQ = quartiles(city_data, metric)
const countryQ = quartiles(country_data, metric)
// ===== Build SVG =====
const svg = d3.create("svg")
.attr("viewBox", [0, 0, width, height])
.attr("style", "max-width:100%; height:auto; background:#f6f8fa; font:11px sans-serif;")
const g = svg.append("g")
// Country fills (greyed base map).
const countries_feature = topojson.feature(world_atlas, world_atlas.objects.countries)
g.append("g")
.selectAll("path")
.data(countries_feature.features)
.join("path")
.attr("d", path)
.attr("fill", "#e9ecef")
.attr("stroke", "#cfd4d9")
.attr("stroke-width", 0.5)
// Two overlay layers — country-level (visible at low zoom) and city-level (at high zoom).
const country_layer = g.append("g").attr("class", "country-layer")
country_layer.selectAll("circle")
.data(country_data)
.join("circle")
.attr("cx", d => projection([d.lng, d.lat])[0])
.attr("cy", d => projection([d.lng, d.lat])[1])
.attr("r", d => countryR(d[metric]))
.attr("fill", d => colorFor(d[metric], countryQ))
.attr("fill-opacity", 0.78)
.attr("stroke", "#1f4ec7")
.attr("stroke-width", 0.6)
.append("title").text(d => `${d.country}\n${d.authors} authors · ${d.affiliations} affiliations`)
const city_layer = g.append("g").attr("class", "city-layer").attr("opacity", 0)
city_layer.selectAll("circle")
.data(city_data)
.join("circle")
.attr("cx", d => projection([d.lng, d.lat])[0])
.attr("cy", d => projection([d.lng, d.lat])[1])
.attr("r", d => cityR(d[metric]))
.attr("fill", d => colorFor(d[metric], cityQ))
.attr("fill-opacity", 0.85)
.attr("stroke", "#1f4ec7")
.attr("stroke-width", 0.5)
.append("title").text(d => `${d.city}, ${d.country}\n${d.authors} authors · ${d.affiliations} affiliations`)
// ===== Zoom behaviour with the city/country threshold =====
const SPLIT_K = 2.0 // above this zoom factor, switch to city-level
const zoom = d3.zoom()
.scaleExtent([1, 8])
.on("zoom", ev => {
g.attr("transform", ev.transform)
// Keep stroke widths visually constant under zoom.
g.selectAll("path").attr("stroke-width", 0.5 / Math.sqrt(ev.transform.k))
const k = ev.transform.k
const show_cities = k >= SPLIT_K
country_layer.attr("opacity", show_cities ? 0 : 1).attr("pointer-events", show_cities ? "none" : "auto")
city_layer.attr("opacity", show_cities ? 1 : 0).attr("pointer-events", show_cities ? "auto" : "none")
// Circle radii also shrink with zoom so they don't grow unbounded.
country_layer.selectAll("circle").attr("r", d => countryR(d[metric]) / Math.sqrt(k))
city_layer.selectAll("circle").attr("r", d => cityR(d[metric]) / Math.sqrt(k))
})
svg.call(zoom)
// ===== Color legend (bottom-left) =====
const legend = svg.append("g").attr("transform", `translate(20, ${height - 38})`)
legend.append("text").attr("x", 0).attr("y", -8)
.attr("font-weight", "bold").attr("fill", "#2a3140")
.text(`Circles sized & colored by ${metric} · zoom in past ${SPLIT_K}× for city detail`)
palette.forEach((c, i) => {
legend.append("rect").attr("x", i * 110).attr("y", 0).attr("width", 100).attr("height", 12).attr("fill", c)
legend.append("text").attr("x", i * 110).attr("y", 26).attr("fill", "#2a3140").text(["Q1 (lowest)","Q2","Q3","Q4 (highest)"][i])
})
return html`<div class="chart-wrap">
<button class="fs-btn" onclick="
const el = this.parentElement;
if (document.fullscreenElement) document.exitFullscreen();
else el.requestFullscreen();
">⛶ Fullscreen</button>
${svg.node()}
</div>`
}Top institutions
{
// Top-15 institutions by distinct authors, recomputed from filtered pubs so
// the chart tracks the global Kind / Approach / Acquisition filters.
const pub_ids = new Set(pubs_filtered.map(p => p.id))
const authors_by_inst = new Map() // affiliation → Set(author_id)
const country_of = new Map() // affiliation → country (for color)
for (const r of pub_authorship_t) {
if (!pub_ids.has(r.pub_id) || !r.affiliation) continue
if (!authors_by_inst.has(r.affiliation)) authors_by_inst.set(r.affiliation, new Set())
authors_by_inst.get(r.affiliation).add(r.author_id)
if (r.country) country_of.set(r.affiliation, r.country)
}
const rows = Array.from(authors_by_inst, ([institution, authors]) => ({
institution,
country: country_of.get(institution) ?? "—",
authors: authors.size
}))
.sort((a, b) => b.authors - a.authors)
.slice(0, 15)
return Plot.plot({
marginLeft: 280,
width: 1100,
height: Math.max(260, rows.length * 26),
x: { label: `Distinct authors (${pubs_filtered.length} papers)`, grid: true },
y: { label: null },
marks: [
Plot.barX(rows, {
x: "authors",
y: d => `${d.institution} — ${d.country}`,
fill: "country",
sort: { y: "x", reverse: true },
tip: true
}),
Plot.ruleX([0])
]
})
}Who’s driving it
The chart shows the twenty most-published authors. The network below it shows how authors with ≥ 3 papers are connected through co-authorship; drag a node to reshape the layout, or hover to highlight a neighborhood.
{
// Re-aggregate top authors from the currently-filtered pubs (so the chart
// reacts to the global Kind / Approach / Acquisition filters).
const counts = new Map()
for (const p of pubs_filtered) {
for (const a of (p.authors ?? "").split(", ").filter(Boolean)) {
counts.set(a, (counts.get(a) ?? 0) + 1)
}
}
const top = Array.from(counts, ([name, publications]) => ({ name, publications }))
.sort((a, b) => b.publications - a.publications)
.slice(0, 20)
return Plot.plot({
marginLeft: 180,
height: Math.max(260, top.length * 22),
x: { label: `Papers in current filter (${pubs_filtered.length} total)`, grid: true },
y: { label: null },
marks: [
Plot.barX(top, {
x: "publications",
y: "name",
fill: "#1f6feb",
sort: { y: "x", reverse: true },
tip: true
}),
Plot.ruleX([0])
]
})
}network_chart = {
// Force-directed co-authorship network. Nodes are the authors that appear in
// coauth_edges_t, links are its rows (weighted), and node color encodes
// affiliation. The bottom LEGEND_H pixels of the viewBox hold the affiliation
// legend; the simulation is centered in the PLOT_H area above it.
const width = 1400
const height = 760
const LEGEND_H = 110
const PLOT_H = height - LEGEND_H
// All affiliations per author (an author can have multiple).
const affs_by_author = d3.rollup(
author_affs_t,
v => Array.from(new Set(v.map(d => d.affiliation))).sort(),
d => d.author
)
// Nodes from the edges (we only render authors with ≥3 papers; that's what coauth_edges_t holds).
// An author with no row in coauth_edges_t therefore never appears.
const node_set = new Set()
for (const e of coauth_edges_t) { node_set.add(e.source); node_set.add(e.target) }
const nodes = Array.from(node_set, name => ({
id: name,
affiliations: affs_by_author.get(name) ?? ["Unknown"],
degree: 0
}))
const node_by_name = new Map(nodes.map(n => [n.id, n]))
// Fresh link objects whose source/target are still author-name strings here.
// The degree and adjacency passes below look nodes up by those strings, so
// they must stay ahead of the d3.forceLink call (which, per the d3-force
// docs, replaces source/target with the node objects).
const links = coauth_edges_t.map(e => ({ source: e.source, target: e.target, weight: e.weight }))
// Weighted degree: the sum of the weights of a node's links.
for (const l of links) {
node_by_name.get(l.source).degree += l.weight
node_by_name.get(l.target).degree += l.weight
}
const max_deg = d3.max(nodes, n => n.degree) || 1
// Unique affiliations sorted for legend + a 22-slot palette so we don't run out of colors.
// Beyond 22 affiliations the colors repeat (index modulo palette length).
const all_affs = Array.from(new Set(nodes.flatMap(n => n.affiliations))).sort()
const palette = d3.schemeTableau10.concat(d3.schemeSet3)
const color = d3.scaleOrdinal().domain(all_affs).range(all_affs.map((_, i) => palette[i % palette.length]))
// Primary affiliation = the one shared with most neighbors (for edge coloring).
// Ties keep the earliest affiliation in the author's sorted list.
const adj = new Map(nodes.map(n => [n.id, []]))
for (const l of links) { adj.get(l.source).push(l.target); adj.get(l.target).push(l.source) }
const primary_aff = new Map()
for (const n of nodes) {
if (n.affiliations.length === 1) { primary_aff.set(n.id, n.affiliations[0]); continue }
let best = n.affiliations[0], best_count = -1
for (const aff of n.affiliations) {
const c = adj.get(n.id).reduce((acc, nb) => acc + (node_by_name.get(nb).affiliations.includes(aff) ? 1 : 0), 0)
if (c > best_count) { best = aff; best_count = c }
}
primary_aff.set(n.id, best)
}
// Within-component clustering force: pull nodes of the same primary-aff together.
// Each affiliation gets an anchor point evenly spaced around an ellipse
// centered on the plot area.
// NOTE(review): indexOf inside this loop is quadratic in the number of
// affiliations.
const aff_centers = new Map()
for (const aff of all_affs) {
const i = all_affs.indexOf(aff)
aff_centers.set(aff, [
width / 2 + Math.cos(i * 2 * Math.PI / all_affs.length) * width * 0.3,
PLOT_H / 2 + Math.sin(i * 2 * Math.PI / all_affs.length) * PLOT_H * 0.35
])
}
// Node radius grows with the square root of the weighted degree.
const NODE_R = d => 8 + 10 * Math.sqrt(d.degree / max_deg)
// Forces: weighted links, mutual repulsion, collision padding, centering, and
// a weak pull of each node towards its primary affiliation's anchor.
const sim = d3.forceSimulation(nodes)
.force("link", d3.forceLink(links).id(d => d.id).distance(70).strength(d => 0.2 + 0.04 * d.weight))
.force("charge", d3.forceManyBody().strength(-220))
.force("collide", d3.forceCollide().radius(d => NODE_R(d) + 4))
.force("center", d3.forceCenter(width / 2, PLOT_H / 2))
.force("aff_x", d3.forceX(d => aff_centers.get(primary_aff.get(d.id))[0]).strength(0.05))
.force("aff_y", d3.forceY(d => aff_centers.get(primary_aff.get(d.id))[1]).strength(0.05))
const svg = d3.create("svg")
.attr("viewBox", [0, 0, width, height])
.attr("style", "max-width: 100%; height: auto; font: 11px sans-serif; cursor: grab;")
// Title
svg.append("text")
.attr("x", width / 2)
.attr("y", 22)
.attr("text-anchor", "middle")
.attr("font-size", 16)
.attr("font-weight", "bold")
.attr("fill", "#24292f")
.text("Co-authorship network — authors with ≥ 3 papers, pie nodes for multi-affiliation")
// Pan/zoom moves only `g` (edges, nodes, labels); the title and the legend
// are appended to the svg directly.
const g = svg.append("g")
svg.call(d3.zoom().on("zoom", ev => g.attr("transform", ev.transform)))
// Edges: tint same-primary-affiliation edges with the affiliation color; others gray.
// `d.source.id ?? d.source` copes with endpoints being either node objects or plain ids.
const link = g.append("g")
.attr("stroke-opacity", 0.45)
.selectAll("line")
.data(links)
.join("line")
.attr("stroke", d => {
const a = primary_aff.get(d.source.id ?? d.source)
const b = primary_aff.get(d.target.id ?? d.target)
return a && a === b ? color(a) : "#bbb"
})
.attr("stroke-width", d => Math.max(1, Math.sqrt(d.weight) * 0.8))
// Node groups (one <g> per author; contains either a circle or pie slices).
const node_g = g.append("g")
.selectAll("g.node")
.data(nodes)
.join("g")
.attr("class", "node")
.call(drag(sim))
// Native tooltip: name, affiliations, weighted degree.
node_g.append("title").text(d =>
`${d.id}\n${d.affiliations.join(" + ")}\n${d.degree} co-author links`
)
// Render each node: single circle if 1 affiliation, pie wedges if multiple.
node_g.each(function (d) {
const r = NODE_R(d)
const sel = d3.select(this)
if (d.affiliations.length === 1) {
sel.append("circle")
.attr("r", r)
.attr("fill", color(d.affiliations[0]))
.attr("stroke", "#fff")
.attr("stroke-width", 1.5)
} else {
// Equal-sized wedges, one per affiliation, in the list's (sorted) order.
const arc = d3.arc().innerRadius(0).outerRadius(r)
const pie = d3.pie().value(1).sort(null)(d.affiliations.map(a => ({ aff: a })))
sel.selectAll("path")
.data(pie)
.join("path")
.attr("d", arc)
.attr("fill", p => color(p.data.aff))
.attr("stroke", "#fff")
.attr("stroke-width", 1)
}
})
// Name labels, offset to the right of each node and transparent to pointer events.
const label = g.append("g")
.attr("pointer-events", "none")
.selectAll("text")
.data(nodes)
.join("text")
.text(d => d.id)
.attr("font-size", 10)
.attr("font-weight", 500)
.attr("fill", "#1d2330")
.attr("dx", d => NODE_R(d) + 3)
.attr("dy", 3)
// Copy the simulated positions into the DOM on every tick.
sim.on("tick", () => {
link
.attr("x1", d => d.source.x).attr("y1", d => d.source.y)
.attr("x2", d => d.target.x).attr("y2", d => d.target.y)
node_g.attr("transform", d => `translate(${d.x}, ${d.y})`)
label.attr("x", d => d.x).attr("y", d => d.y)
})
// Affiliation legend at the bottom.
const legend_g = svg.append("g")
.attr("transform", `translate(40, ${PLOT_H + 10})`)
legend_g.append("text")
.attr("x", 0).attr("y", 0)
.attr("font-weight", "bold")
.attr("font-size", 12)
.attr("fill", "#57606a")
.text(`Affiliation (${all_affs.length})`)
// Legend grid: five entries per row, 260 px per column, 18 px per row;
// names longer than 32 characters are cut to 30 plus an ellipsis.
const cols_per_row = 5
legend_g.selectAll("g.leg-item")
.data(all_affs)
.join("g")
.attr("class", "leg-item")
.attr("transform", (_, i) => `translate(${(i % cols_per_row) * 260}, ${20 + Math.floor(i / cols_per_row) * 18})`)
.call(s => {
s.append("circle").attr("r", 6).attr("cx", 6).attr("cy", -4).attr("fill", color)
s.append("text")
.attr("x", 18).attr("y", 0)
.attr("font-size", 11)
.attr("fill", "#24292f")
.text(d => d.length > 32 ? d.slice(0, 30) + "…" : d)
})
// Stop the simulation when the `invalidation` promise resolves.
invalidation.then(() => sim.stop())
// Drag behaviour: pin the node under the pointer (fx/fy) while dragging and
// reheat the simulation; release the pin when the drag ends. Declared as a
// function so it is hoisted above its use in `.call(drag(sim))`.
function drag(simulation) {
return d3.drag()
.on("start", (ev, d) => { if (!ev.active) simulation.alphaTarget(0.3).restart(); d.fx = d.x; d.fy = d.y })
.on("drag", (ev, d) => { d.fx = ev.x; d.fy = ev.y })
.on("end", (ev, d) => { if (!ev.active) simulation.alphaTarget(0); d.fx = null; d.fy = null })
}
return html`<div class="chart-wrap">
<button class="fs-btn" onclick="
const el = this.parentElement;
if (document.fullscreenElement) document.exitFullscreen();
else el.requestFullscreen();
">⛶ Fullscreen</button>
<div class="chart-scroll">${svg.node()}</div>
</div>`
}Models and the authors behind them
This second view rewires the same network as a bipartite graph: every prolific author (≥ 3 papers) is linked to the models they helped publish. Algorithm nodes are diamonds colored by their architecture family — so you can see which research groups own which slice of the architectural landscape.
author_algo_network = {
const width = 1400
const height = 760
const LEGEND_H = 90
const PLOT_H = height - LEGEND_H
// Same prolific-author filter as the co-auth network (authors with ≥ 3 papers).
const prolific = new Set(coauth_edges_t.flatMap(e => [e.source, e.target]))
// Split a comma-separated field into trimmed, non-empty tokens.
const split_list = s => String(s).split(",").map(x => x.trim()).filter(Boolean)
// Author → algorithm edges, weighted by number of papers connecting them.
// When a publication carries a version tag (Casanovo v1/v2/v5), the edge
// points at the version-suffixed label so each release becomes its own node.
// Edges are kept as link records in a nested Map (author → label → link)
// instead of under a delimiter-joined string key, so no name ever has to be
// parsed back out of a key. `links` preserves first-seen order.
const link_index = new Map()
const algo_to_base = new Map() // versioned label → base algorithm name (for family lookup)
const links = []
for (const p of pubs_t) {
  if (!p.authors || !p.models) continue
  const models = split_list(p.models)
  for (const a of split_list(p.authors)) {
    if (!prolific.has(a)) continue
    let per_author = link_index.get(a)
    if (!per_author) {
      per_author = new Map()
      link_index.set(a, per_author)
    }
    for (const m of models) {
      const label = p.version ? `${m} ${p.version}` : m
      algo_to_base.set(label, m)
      let link = per_author.get(label)
      if (!link) {
        link = { source: a, target: label, weight: 0 }
        per_author.set(label, link)
        links.push(link)
      }
      link.weight += 1
    }
  }
}
// Node lists are derived from the links, so only authors and algorithms that
// actually share an edge become nodes (authors first, then algorithms).
const algo_meta = new Map(algorithms_t.map(a => [a.model, a]))
const author_set = new Set(links.map(l => l.source))
const algo_set = new Set(links.map(l => l.target))
const nodes = [
  ...Array.from(author_set, name => ({ id: name, kind: "author", degree: 0 })),
  ...Array.from(algo_set, name => {
    // Versioned labels look up their family through the base algorithm name.
    const meta = algo_meta.get(algo_to_base.get(name) ?? name)
    return { id: name, kind: "algo", family: meta?.family ?? "Unknown", degree: 0 }
  })
]
// Weighted degree: every link adds its paper count to both endpoints.
const node_by_name = new Map(nodes.map(n => [n.id, n]))
for (const l of links) {
  node_by_name.get(l.source).degree += l.weight
  node_by_name.get(l.target).degree += l.weight
}
// Color palette: gray for authors, family color for algorithms.
const family_color = {
"CNN + RNN": "#4C72B0",
"Transformer (AR)": "#DD8452",
"GNN": "#937860",
"CNN": "#8172B3",
"Transformer (NAR)": "#55A868",
"Diffusion": "#C44E52",
"Flow": "#937DC2",
"Unknown": "#999999"
}
const nodeFill = d => d.kind === "author" ? "#cdd6e0" : (family_color[d.family] ?? "#999")
const nodeStroke = d => d.kind === "author" ? "#8a94a3" : "#222"
const max_algo_deg = d3.max(nodes.filter(n => n.kind === "algo"), n => n.degree) || 1
const max_auth_deg = d3.max(nodes.filter(n => n.kind === "author"), n => n.degree) || 1
const nodeR = d => d.kind === "author"
? 5 + 8 * Math.sqrt(d.degree / max_auth_deg)
: 9 + 14 * Math.sqrt(d.degree / max_algo_deg)
// Pull algorithm nodes toward a central ring so they cluster by family.
const families_seen = Array.from(new Set(nodes.filter(n => n.kind === "algo").map(n => n.family))).sort()
const family_target = new Map()
for (let i = 0; i < families_seen.length; i++) {
const angle = i * 2 * Math.PI / families_seen.length
family_target.set(families_seen[i], [
width / 2 + Math.cos(angle) * width * 0.25,
PLOT_H / 2 + Math.sin(angle) * PLOT_H * 0.30
])
}
const sim = d3.forceSimulation(nodes)
.force("link", d3.forceLink(links).id(d => d.id).distance(d => 60 + 8 / d.weight).strength(0.35))
.force("charge", d3.forceManyBody().strength(d => d.kind === "algo" ? -350 : -90))
.force("collide", d3.forceCollide().radius(d => nodeR(d) + 4))
.force("center", d3.forceCenter(width / 2, PLOT_H / 2))
.force("fam_x", d3.forceX(d => d.kind === "algo" ? family_target.get(d.family)[0] : width / 2).strength(d => d.kind === "algo" ? 0.08 : 0.01))
.force("fam_y", d3.forceY(d => d.kind === "algo" ? family_target.get(d.family)[1] : PLOT_H / 2).strength(d => d.kind === "algo" ? 0.08 : 0.01))
const svg = d3.create("svg")
.attr("viewBox", [0, 0, width, height])
.attr("style", "max-width: 100%; height: auto; font: 11px sans-serif; cursor: grab;")
svg.append("text")
.attr("x", width / 2).attr("y", 22)
.attr("text-anchor", "middle")
.attr("font-size", 16)
.attr("font-weight", "bold")
.attr("fill", "#24292f")
.text("Authors ↔ models — diamonds are algorithms colored by family, circles are authors")
const g = svg.append("g")
svg.call(d3.zoom().on("zoom", ev => g.attr("transform", ev.transform)))
const link = g.append("g")
.attr("stroke", "#aaa")
.attr("stroke-opacity", 0.45)
.selectAll("line")
.data(links)
.join("line")
.attr("stroke-width", d => Math.max(0.8, Math.sqrt(d.weight) * 0.7))
const node_g = g.append("g")
.selectAll("g.node")
.data(nodes)
.join("g")
.attr("class", "node")
.call(drag(sim))
node_g.append("title").text(d => d.kind === "algo"
? `${d.id} (algorithm)\nFamily: ${d.family}\n${d.degree} author links`
: `${d.id} (author)\n${d.degree} model links`
)
// Authors as circles; algorithms as diamonds (rotated squares).
node_g.each(function (d) {
const sel = d3.select(this)
const r = nodeR(d)
if (d.kind === "author") {
sel.append("circle")
.attr("r", r)
.attr("fill", nodeFill(d))
.attr("stroke", nodeStroke(d))
.attr("stroke-width", 1.2)
} else {
sel.append("rect")
.attr("x", -r).attr("y", -r)
.attr("width", r * 2).attr("height", r * 2)
.attr("transform", "rotate(45)")
.attr("fill", nodeFill(d))
.attr("stroke", nodeStroke(d))
.attr("stroke-width", 1.5)
}
})
const label = g.append("g")
.attr("pointer-events", "none")
.selectAll("text")
.data(nodes)
.join("text")
.text(d => d.id)
.attr("font-size", d => d.kind === "algo" ? 12 : 9)
.attr("font-weight", d => d.kind === "algo" ? "bold" : 500)
.attr("fill", d => d.kind === "algo" ? "#0b0d12" : "#3b4150")
.attr("dx", d => nodeR(d) + 3)
.attr("dy", 3)
sim.on("tick", () => {
link
.attr("x1", d => d.source.x).attr("y1", d => d.source.y)
.attr("x2", d => d.target.x).attr("y2", d => d.target.y)
node_g.attr("transform", d => `translate(${d.x}, ${d.y})`)
label.attr("x", d => d.x).attr("y", d => d.y)
})
// Legend: algorithm-family colors + the author swatch.
const legend_g = svg.append("g").attr("transform", `translate(40, ${PLOT_H + 15})`)
legend_g.append("text")
.attr("x", 0).attr("y", 0)
.attr("font-weight", "bold")
.attr("font-size", 12)
.attr("fill", "#57606a")
.text("Algorithm family")
const legend_items = families_seen.map(f => ({ label: f, fill: family_color[f] ?? "#999", shape: "diamond" }))
.concat([{ label: "Author (size = # model links)", fill: "#cdd6e0", shape: "circle" }])
legend_g.selectAll("g.leg-item")
.data(legend_items)
.join("g")
.attr("class", "leg-item")
.attr("transform", (_, i) => `translate(${(i % 5) * 260}, ${22 + Math.floor(i / 5) * 22})`)
.call(s => {
s.append(d => d.shape === "diamond"
? document.createElementNS("http://www.w3.org/2000/svg", "rect")
: document.createElementNS("http://www.w3.org/2000/svg", "circle"))
.attr("transform", d => d.shape === "diamond" ? "rotate(45) translate(0,0)" : null)
.each(function (d) {
const el = d3.select(this)
if (d.shape === "diamond") el.attr("x", -7).attr("y", -7).attr("width", 14).attr("height", 14)
else el.attr("r", 7).attr("cx", 0).attr("cy", 0)
el.attr("fill", d.fill).attr("stroke", d.fill === "#cdd6e0" ? "#8a94a3" : "#222").attr("stroke-width", 1.2)
})
s.append("text")
.attr("x", 14).attr("y", 4)
.attr("font-size", 11)
.attr("fill", "#24292f")
.text(d => d.label)
})
invalidation.then(() => sim.stop())
// Drag behaviour for force-layout nodes: while a node is held it is pinned to
// the pointer through fx/fy, and the pin is cleared again on release.
function drag(simulation) {
  // Pin the node at its current position; raise the simulation's alpha target
  // and restart it when `event.active` is falsy.
  function onStart(event, node) {
    if (!event.active) simulation.alphaTarget(0.3).restart()
    node.fx = node.x
    node.fy = node.y
  }
  // Keep the pinned position glued to the pointer.
  function onMove(event, node) {
    node.fx = event.x
    node.fy = event.y
  }
  // Reset the alpha target (same `event.active` guard) and unpin the node.
  function onEnd(event, node) {
    if (!event.active) simulation.alphaTarget(0)
    node.fx = null
    node.fy = null
  }
  return d3.drag()
    .on("start", onStart)
    .on("drag", onMove)
    .on("end", onEnd)
}
return html`<div class="chart-wrap">
<button class="fs-btn" onclick="
const el = this.parentElement;
if (document.fullscreenElement) document.exitFullscreen();
else el.requestFullscreen();
">⛶ Fullscreen</button>
<div class="chart-scroll">${svg.node()}</div>
</div>`
}How the field cites itself
A chronological citation arc diagram. Papers are placed left-to-right by publication date and stratified vertically by kind; within each row, the most-cited papers float to the top. Each arc connects a citing paper (right end) to a paper it cites (left end), curving upward above the row. Hover any paper to highlight the citations into it (red) and out of it (blue), and dim everything else.
Edges resolved from Crossref (by DOI) and Semantic Scholar (by DOI or title-search fallback), matched back to publications via DOI-exact (and, for refs without a DOI, fuzzy-title with token-set ratio ≥ 92). Only intra-catalog citations are drawn — references to papers outside the catalog are filtered out. Every arrow runs citing → cited, so the arrowhead always lands on the older paper.
citation_arcs = {
if (!citations_t.length) {
return html`<p style="color:#57606a; font-style:italic; padding:1rem;">
No citation edges yet. Run <code>uv run python build_citations.py</code> to populate the graph.
</p>`
}
// ===== Layout parameters =====
const width = 1600
const height = 820
const marginTop = 120 // extra headroom for arc apexes near the top rows
const marginBottom = 60
const marginLeft = 130
const marginRight = 30
const innerWidth = width - marginLeft - marginRight
const innerHeight = height - marginTop - marginBottom
// ===== Per-publication node data =====
const pub_by_id = new Map(pubs_t.map(p => [p.id, p]))
const cite_count = new Map() // pub_id → in-degree (times cited by other catalog papers)
const cite_out = new Map() // pub_id → out-degree (papers in catalog it cites)
for (const e of citations_t) {
cite_count.set(e.cited_id, (cite_count.get(e.cited_id) ?? 0) + 1)
cite_out.set(e.citing_id, (cite_out.get(e.citing_id) ?? 0) + 1)
}
// Y-strata by kind. Meta types at the top, core algorithms at the bottom — so
// arcs (which always curve upward) have the most headroom for citations heading
// *into* heavily-cited algorithm-row papers.
const KIND_ORDER = [
{ kind: "meta", label: "Meta" },
{ kind: "benchmark", label: "Benchmarks" },
{ kind: "review", label: "Reviews / surveys" },
{ kind: "adjacent", label: "Adjacent" },
{ kind: "downstream-application", label: "Downstream apps" },
{ kind: "post-processor", label: "Post-processors" },
{ kind: "algorithm", label: "Algorithms" }
]
const kind_color = {
"algorithm": "#1f6feb",
"post-processor": "#bf8700",
"downstream-application": "#1a7f37",
"adjacent": "#a371f7",
"review": "#cf222e",
"benchmark": "#0969da",
"meta": "#8c959f",
"unknown": "#8c959f"
}
const row_h = innerHeight / KIND_ORDER.length
const row_index = new Map(KIND_ORDER.map((d, i) => [d.kind, i]))
// Only include papers that participate in ≥ 1 edge.
const involved = new Set()
for (const e of citations_t) { involved.add(e.citing_id); involved.add(e.cited_id) }
const nodes = Array.from(involved, id => {
const p = pub_by_id.get(id) ?? {}
const d = p.date instanceof Date ? p.date : new Date(p.date ?? Date.now())
return {
id,
title: p.title ?? `#${id}`,
year: p.year,
date: d,
kind: p.kind ?? "unknown",
models: p.models ?? "",
in_deg: cite_count.get(id) ?? 0,
out_deg: cite_out.get(id) ?? 0
}
}).filter(n => n.date instanceof Date && !isNaN(n.date))
// X scale: publication date.
const x_extent = d3.extent(nodes, n => n.date)
const x_pad = (x_extent[1] - x_extent[0]) * 0.02 || 1e9
const xScale = d3.scaleTime()
.domain([new Date(+x_extent[0] - x_pad), new Date(+x_extent[1] + x_pad)])
.range([0, innerWidth])
// Within-row placement: x comes from the publication date, y from the paper's
// in-catalog citation count scaled against the most-cited paper (max_in).
const max_in = d3.max(nodes, n => n.in_deg) || 1
const row_pad = 16 // vertical inset so dots keep clear of the row edges
nodes.forEach(paper => {
  // Kinds without a row of their own fall into the last row.
  const row = row_index.get(paper.kind) ?? KIND_ORDER.length - 1
  const row_top = row * row_h + row_pad
  const row_bottom = (row + 1) * row_h - row_pad
  // 0 → most cited (top edge of the row); 1 → uncited (bottom edge).
  const depth = 1 - Math.sqrt(paper.in_deg / max_in)
  paper.y = row_top + depth * (row_bottom - row_top)
  paper.x = xScale(paper.date)
})
const node_by_id = new Map(nodes.map(n => [n.id, n]))
const links = citations_t
.map(e => ({ source: node_by_id.get(e.citing_id), target: node_by_id.get(e.cited_id), source_kind: e.source }))
.filter(l => l.source && l.target)
// ===== Draw =====
const svg = d3.create("svg")
.attr("viewBox", [0, 0, width, height])
.attr("style", "max-width: 100%; height: auto; font: 11px sans-serif;")
const root = svg.append("g").attr("transform", `translate(${marginLeft}, ${marginTop})`)
// Row backgrounds + labels
root.selectAll("g.row")
.data(KIND_ORDER)
.join("g")
.attr("class", "row")
.call(g => {
g.append("rect")
.attr("x", 0)
.attr("y", (_, i) => i * row_h)
.attr("width", innerWidth)
.attr("height", row_h)
.attr("fill", d => kind_color[d.kind] ?? "#999")
.attr("fill-opacity", 0.05)
g.append("text")
.attr("x", -8)
.attr("y", (_, i) => i * row_h + row_h / 2 + 4)
.attr("text-anchor", "end")
.attr("font-size", 12)
.attr("font-weight", "bold")
.attr("fill", d => kind_color[d.kind] ?? "#999")
.text(d => d.label)
})
// Time axis
root.append("g")
.attr("transform", `translate(0, ${innerHeight})`)
.call(d3.axisBottom(xScale).ticks(d3.timeYear.every(2)).tickFormat(d3.timeFormat("%Y")))
.selectAll("text")
.attr("font-size", 11)
// Title
svg.append("text")
.attr("x", width / 2).attr("y", 20)
.attr("text-anchor", "middle")
.attr("font-size", 15)
.attr("font-weight", "bold")
.attr("fill", "#24292f")
.text(`Citation flow — ${nodes.length} papers, ${links.length} intra-catalog citations (citing → cited)`)
// Citation arcs: SVG path data for one link, drawn as a cubic Bézier from
// d.source to d.target that bows upward (towards smaller y).
function arcPath(d) {
  const { source: from, target: to } = d
  const midX = (from.x + to.x) / 2
  // Both control points share one apex: above the higher endpoint by 60 % of
  // the horizontal span, capped at 180 px.
  const span = Math.abs(from.x - to.x)
  const apexY = Math.min(from.y, to.y) - Math.min(180, span * 0.6)
  return `M ${from.x} ${from.y} C ${midX} ${apexY}, ${midX} ${apexY}, ${to.x} ${to.y}`
}
// Arrowhead markers. We need one per arc color (default/red/blue) because
// SVG markers inherit their fill from the marker itself, not the path's
// stroke. Refs get swapped on hover so the marker color matches the arc.
const defs = svg.append("defs")
const mk_marker = (id, color) => defs.append("marker")
.attr("id", id).attr("viewBox", "0 0 10 10").attr("refX", 9).attr("refY", 5)
.attr("markerWidth", 6).attr("markerHeight", 6).attr("orient", "auto-start-reverse")
.append("path").attr("d", "M0,0 L10,5 L0,10 z").attr("fill", color)
mk_marker("arr-default", "#94a3b8")
mk_marker("arr-red", "#cf222e")
mk_marker("arr-blue", "#1f6feb")
const arc_layer = root.append("g").attr("class", "arcs").attr("fill", "none")
const arc = arc_layer.selectAll("path")
.data(links)
.join("path")
.attr("d", arcPath)
.attr("stroke", "#94a3b8")
.attr("stroke-opacity", 0.18)
.attr("stroke-width", d => d.source_kind === "both" ? 0.9 : 0.6)
.attr("marker-end", "url(#arr-default)")
// Nodes
const nodeR = d => 3 + 5 * Math.sqrt(d.in_deg / max_in)
const node_layer = root.append("g").attr("class", "nodes")
const node = node_layer.selectAll("circle")
.data(nodes)
.join("circle")
.attr("cx", d => d.x)
.attr("cy", d => d.y)
.attr("r", nodeR)
.attr("fill", d => kind_color[d.kind] ?? "#999")
.attr("stroke", "#fff")
.attr("stroke-width", 0.8)
.attr("cursor", "pointer")
node.append("title").text(d =>
`${d.models || "(no model)"} — ${d.title}
year: ${d.year ?? ""}
kind: ${d.kind}
cited by ${d.in_deg} papers in the catalog
cites ${d.out_deg} papers in the catalog`
)
// Labels: at most five papers per kind, taken in descending in-degree order,
// and only papers cited at least four times within the catalog.
const top_per_row = new Map()
const by_in_deg_desc = nodes.slice().sort((a, b) => b.in_deg - a.in_deg)
by_in_deg_desc.forEach(paper => {
  let bucket = top_per_row.get(paper.kind)
  if (!bucket) {
    bucket = []
    top_per_row.set(paper.kind, bucket)
  }
  const has_room = bucket.length < 5
  if (has_room && paper.in_deg >= 4) bucket.push(paper)
})
const labelled = Array.from(top_per_row.values()).flat()
const label_layer = root.append("g").attr("class", "labels").attr("pointer-events", "none")
label_layer.selectAll("text")
.data(labelled)
.join("text")
.text(d => (d.models?.split(",")[0]?.trim()) || d.title.slice(0, 22))
.attr("x", d => d.x)
.attr("y", d => d.y - nodeR(d) - 4)
.attr("text-anchor", "middle")
.attr("font-size", 10)
.attr("font-weight", 600)
.attr("fill", "#1d2330")
// ===== Hover behaviour =====
// Precompute neighborhood maps.
const out_edges = new Map()
const in_edges = new Map()
for (const l of links) {
if (!out_edges.has(l.source.id)) out_edges.set(l.source.id, [])
if (!in_edges.has (l.target.id)) in_edges.set (l.target.id, [])
out_edges.get(l.source.id).push(l)
in_edges.get (l.target.id).push(l)
}
node.on("mouseenter", function (_, focus) {
const ins = new Set((in_edges.get(focus.id) ?? []).map(l => l.source.id))
const outs = new Set((out_edges.get(focus.id) ?? []).map(l => l.target.id))
arc
.attr("stroke", l => {
if (l.target.id === focus.id) return "#cf222e" // who cites this paper
if (l.source.id === focus.id) return "#1f6feb" // who this paper cites
return "#94a3b8"
})
.attr("marker-end", l => {
if (l.target.id === focus.id) return "url(#arr-red)"
if (l.source.id === focus.id) return "url(#arr-blue)"
return "url(#arr-default)"
})
.attr("stroke-opacity", l => (l.source.id === focus.id || l.target.id === focus.id) ? 0.85 : 0.04)
.attr("stroke-width", l => (l.source.id === focus.id || l.target.id === focus.id) ? 1.4 : 0.4)
node.attr("opacity", d => {
if (d.id === focus.id) return 1
if (ins.has(d.id) || outs.has(d.id)) return 1
return 0.22
})
})
node.on("mouseleave", function () {
arc.attr("stroke", "#94a3b8").attr("stroke-opacity", 0.18)
.attr("stroke-width", d => d.source_kind === "both" ? 0.9 : 0.6)
.attr("marker-end", "url(#arr-default)")
node.attr("opacity", 1)
})
return html`<div class="chart-wrap">
<button class="fs-btn" onclick="
const el = this.parentElement;
if (document.fullscreenElement) document.exitFullscreen();
else el.requestFullscreen();
">⛶ Fullscreen</button>
<div class="chart-scroll">${svg.node()}</div>
</div>`
}Where it appears
Most papers in this space appear first on bioRxiv or arXiv. Toggle preprints vs. peer-reviewed to see how the venue distribution shifts.
{
// Re-aggregate venues from the globally-filtered pubs so the chart respects
// Kind / Approach / Acquisition, then apply the section's preprint/peer-reviewed radio.
const subset = pubs_filtered.filter(p => venue_type === "all" || p.type === venue_type)
const grouped = Array.from(
d3.rollup(subset.filter(p => p.journal), v => v.length, p => p.journal),
([venue, papers]) => ({ venue, papers })
).sort((a, b) => d3.descending(a.papers, b.papers)).slice(0, 15)
return Plot.plot({
marginLeft: 180,
height: Math.max(260, grouped.length * 24),
x: { label: "Papers", grid: true },
y: { label: null },
marks: [
Plot.barX(grouped, {
x: "papers",
y: "venue",
fill: "#6f42c1",
sort: { y: "x", reverse: true },
tip: true
}),
Plot.ruleX([0])
]
})
}Venue citedness (open-data analog of the Impact Factor)
Two-year mean citedness from OpenAlex (summary_stats.2yr_mean_citedness). Methodologically equivalent to the Clarivate Impact Factor formula — mean citations in year t to articles published in years t-1 and t-2 — but computed over OpenAlex’s open Crossref-aggregated citation graph rather than the paywalled Web of Science one. Conferences and preprint servers are omitted (their non-rolling publication schedule makes the metric misleading). Built offline via build_journal_metrics.py; refresh annually.
{
// Decorate each venue row with the catalog's paper count for that venue,
// so users can spot where heavy curation overlaps with high-citedness venues.
const paper_counts = new Map()
for (const p of pubs_t) {
if (!p.journal) continue
paper_counts.set(p.journal, (paper_counts.get(p.journal) ?? 0) + 1)
}
const rows = venue_if_search.map(v => ({
...v,
papers_in_catalog: paper_counts.get(v.journal) ?? 0
}))
const table = Inputs.table(rows, {
columns: ["journal", "two_yr_citedness", "h_index", "papers_in_catalog", "year_collected"],
header: {
journal: "Venue",
two_yr_citedness: "IF₂ᵧᵣ (OpenAlex)",
h_index: "h-index",
papers_in_catalog: "Papers in catalog",
year_collected: "Year collected"
},
format: {
two_yr_citedness: c => c == null ? "" : c.toFixed(2),
h_index: v => v == null ? "" : String(v),
year_collected: y => y == null ? "" : String(y)
},
sort: "two_yr_citedness",
reverse: true,
rows: 30,
width: { two_yr_citedness: 130, h_index: 90, papers_in_catalog: 140, year_collected: 130 }
})
return html`<div class="chart-wrap">
<button class="fs-btn" onclick="
const el = this.parentElement;
if (document.fullscreenElement) document.exitFullscreen();
else el.requestFullscreen();
">⛶ Fullscreen</button>
<div class="chart-scroll">${table}</div>
</div>`
}Publication lifecycle
How a method goes from arXiv / bioRxiv preprint to a peer-reviewed publication. For each algorithm, every preprint is paired greedily with the earliest following peer-reviewed publication (peer-reviewed / ML-conference / thesis all count as “post-preprint”). The Status column then tells you whether each row is paired (lifecycle complete), preprint-only (still in flight), or peer-reviewed-only (published without a preprint we have on file).
lifecycle_rows = {
// Group publications by (method name, version). A pub without a model link
// is bucketed under a per-pub synthetic key so it can't accidentally pair
// with another stand-alone pub. Versioned publications (currently Casanovo
// v1 / v2 / v5) are bucketed by `${method}|${version}` so each version
// release pairs independently rather than collapsing into a single Casanovo
// row.
const by_group = new Map()
for (const p of pubs_filtered) {
const method = (p.models ?? "").split(",")[0]?.trim() || `_pub_${p.id}`
const key = p.version ? `${method}|${p.version}` : method
if (!by_group.has(key)) by_group.set(key, { method, version: p.version || null, pubs: [] })
by_group.get(key).pubs.push(p)
}
const POST_PREPRINT = new Set(["peer-reviewed", "ML conference", "thesis"])
const rows = []
for (const { method, version, pubs } of by_group.values()) {
// Sort each method's pubs by date.
const sorted = pubs.slice().sort((a, b) => (a.date ?? 0) - (b.date ?? 0))
const preprints = sorted.filter(p => p.type === "preprint")
const peers = sorted.filter(p => POST_PREPRINT.has(p.type))
// Greedy: each preprint claims the earliest unclaimed later peer-reviewed pub.
const claimed = new Set()
for (const pp of preprints) {
const target = peers.find(p => !claimed.has(p.id) && (p.date ?? 0) >= (pp.date ?? 0))
if (target) {
claimed.add(target.id)
const gap_days = ((target.date ?? 0) - (pp.date ?? 0)) / 86400000
const gap_months = gap_days / 30.44
rows.push({
method,
version,
status: "paired",
preprint_date: pp.date,
preprint_title: pp.title,
preprint_url: pp.url || (pp.doi ? `https://doi.org/${pp.doi}` : null),
peer_date: target.date,
peer_title: target.title,
peer_url: target.url || (target.doi ? `https://doi.org/${target.doi}` : null),
gap_months: Math.round(gap_months * 10) / 10
})
} else {
rows.push({
method,
version,
status: "preprint-only",
preprint_date: pp.date,
preprint_title: pp.title,
preprint_url: pp.url || (pp.doi ? `https://doi.org/${pp.doi}` : null),
peer_date: null,
peer_title: null,
peer_url: null,
gap_months: null
})
}
}
for (const p of peers) {
if (claimed.has(p.id)) continue
rows.push({
method,
version,
status: "peer-reviewed-only",
preprint_date: null,
preprint_title: null,
preprint_url: null,
peer_date: p.date,
peer_title: p.title,
peer_url: p.url || (p.doi ? `https://doi.org/${p.doi}` : null),
gap_months: null
})
}
}
// Order: paired first (by gap, shortest → longest), then preprint-only by
// preprint date (oldest first = candidates for follow-up), then
// peer-reviewed-only by year.
const status_rank = { "paired": 0, "preprint-only": 1, "peer-reviewed-only": 2 }
return rows.sort((a, b) => {
const s = status_rank[a.status] - status_rank[b.status]
if (s !== 0) return s
if (a.status === "paired") return a.gap_months - b.gap_months
return (a.preprint_date ?? a.peer_date ?? 0) - (b.preprint_date ?? b.peer_date ?? 0)
})
}{
const table = Inputs.table(lifecycle_search, {
columns: ["method", "version", "status", "preprint_date", "peer_date", "gap_months"],
header: {
method: "Method",
version: "Version",
status: "Status",
preprint_date: "Preprint",
peer_date: "Peer-reviewed",
gap_months: "Gap (months)"
},
format: {
preprint_date: (d, i) => {
const row = lifecycle_search[i]
if (!d) return ""
const label = d.toISOString().slice(0, 7)
return row.preprint_url
? htl.html`<a href="${row.preprint_url}" target="_blank" rel="noopener" title="${row.preprint_title}">${label}</a>`
: label
},
peer_date: (d, i) => {
const row = lifecycle_search[i]
if (!d) return ""
const label = d.toISOString().slice(0, 7)
return row.peer_url
? htl.html`<a href="${row.peer_url}" target="_blank" rel="noopener" title="${row.peer_title}">${label}</a>`
: label
},
gap_months: g => g == null ? "" : `${g.toFixed(1)} mo`
},
rows: 25,
width: { method: 200, version: 70, status: 160, preprint_date: 110, peer_date: 110, gap_months: 110 }
})
return html`<div class="chart-wrap">
<button class="fs-btn" onclick="
const el = this.parentElement;
if (document.fullscreenElement) document.exitFullscreen();
else el.requestFullscreen();
">⛶ Fullscreen</button>
<div class="chart-scroll">${table}</div>
</div>`
}{
// Label = method + version (when versioned). Casanovo's three versions then
// get their own y-axis rows; everything else stays as just the method name.
const paired = lifecycle_rows
.filter(r => r.status === "paired")
.map(r => ({
...r,
label: r.version ? `${r.method} ${r.version}` : r.method
}))
if (!paired.length) {
return html`<p style="color:#57606a; font-style:italic;">No paired methods in the current filter.</p>`
}
return Plot.plot({
marginLeft: 200,
height: Math.max(220, paired.length * 20),
x: { label: "Preprint → peer-reviewed gap (months)", grid: true },
y: { label: null },
marks: [
Plot.barX(paired, {
x1: 0,
x2: "gap_months", // explicit x1/x2 bypasses Plot's auto stackX transform
y: "label",
fill: "#1f6feb",
sort: { y: "x2", reverse: true },
tip: true
}),
Plot.ruleX([0])
]
})
}Browse all papers
viewof selected_pubs = {
// Decorate each row with the venue's OpenAlex 2-year citedness (or null).
// Also rewrite the comma-separated `models` string to append known aliases
// (e.g. π-HelixNovo → "π-HelixNovo (aka PandaNovo)") so renamed methods stay
// searchable by their historical name in the table's filter box.
const aliases_by_model = new Map(
algorithms_t.filter(a => a.aliases).map(a => [a.model, a.aliases])
)
const annotate_models = s => (s ?? "")
.split(",").map(x => x.trim()).filter(Boolean)
.map(m => aliases_by_model.has(m) ? `${m} (aka ${aliases_by_model.get(m)})` : m)
.join(", ")
const search_with_if = search.map(r => ({
...r,
models: annotate_models(r.models),
citedness: journal_impact_by_name.get(r.journal)?.two_yr_citedness ?? null
}))
const table = Inputs.table(search_with_if, {
columns: ["year", "models", "version", "kind", "is_dl", "acquisition", "title", "authors", "journal", "citedness", "type", "repo"],
header: {
year: "Year", models: "Method(s)", version: "Ver.", kind: "Kind", is_dl: "DL?", acquisition: "Acq.",
title: "Title", authors: "Authors", journal: "Venue", citedness: "IF₂ᵧᵣ", type: "Type", repo: "Code"
},
format: {
year: y => y == null ? "" : String(y),
is_dl: v => v === 1 || v === true ? "DL" : (v === 0 || v === false ? "classical" : ""),
citedness: c => c == null ? "" : c.toFixed(1),
title: (t, i) => {
const row = search_with_if[i]
const url = row?.url || (row?.doi ? `https://doi.org/${row.doi}` : null)
return url ? htl.html`<a href="${url}" target="_blank" rel="noopener">${t}</a>` : t
},
authors: a => a && a.length > 80 ? a.slice(0, 80) + "…" : a,
repo: r => {
if (!r) return ""
// The repository column holds either a single URL or two whitespace-separated URLs
// (a few RNovA-style entries). Render up to three short link chips.
const urls = String(r).split(/\s+/).filter(s => /^https?:\/\//.test(s)).slice(0, 3)
if (!urls.length) return ""
return htl.html`${urls.map(u => htl.html`<a href="${u}" target="_blank" rel="noopener" title="${u}" style="margin-right:4px">↗</a>`)}`
}
},
sort: "year",
reverse: true,
rows: 25,
width: { year: 60, type: 100, models: 130, version: 60, kind: 130, is_dl: 70, acquisition: 70, citedness: 70, repo: 60 }
})
// Reflect the inner Inputs.table's value/input on the fullscreen wrapper so
// `viewof selected_pubs` exposes the rows the user has ticked — used by the
// BibTeX download button below.
const wrapper = html`<div class="chart-wrap">
<button class="fs-btn" onclick="
const el = this.parentElement;
if (document.fullscreenElement) document.exitFullscreen();
else el.requestFullscreen();
">⛶ Fullscreen</button>
<div class="chart-scroll">${table}</div>
</div>`
Object.defineProperty(wrapper, "value", { get: () => table.value })
table.addEventListener("input", e => wrapper.dispatchEvent(new CustomEvent("input", { bubbles: false })))
return wrapper
}{
// ---- BibTeX export ----
// Format publications as BibTeX entries. Entry type is picked from the
// publication type; the key is a first-author-lastname + year + title-word slug.

// Lower-case ASCII alphanumerics only; used to assemble citation keys.
const slug = s => (s ?? "").toString().toLowerCase().replace(/[^a-z0-9]+/g, "")
// Minimal brace/backslash escaping for fields that are not running text
// (doi, url): LaTeX-escaping `_` or `%` there would corrupt the identifier.
const escape = s => (s ?? "").toString().replace(/[{}\\]/g, "\\$&")
// LaTeX escaping for text fields (title, author, venue). Without it a venue
// such as "Molecular & Cellular Proteomics" aborts compilation on the bare `&`.
const TEX_ESCAPES = {
  "\\": "\\textbackslash{}",
  "{": "\\{",
  "}": "\\}",
  "&": "\\&",
  "%": "\\%",
  "#": "\\#",
  "_": "\\_",
  "$": "\\$",
  "^": "\\textasciicircum{}",
  "~": "\\textasciitilde{}"
}
const escape_tex = s => (s ?? "").toString().replace(/[\\{}&%#_$^~]/g, ch => TEX_ESCAPES[ch])
const entry_type_of = t => ({
  "ML conference": "inproceedings",
  "thesis": "phdthesis",
  "preprint": "misc",
  "commentary": "misc"
})[t] ?? "article"
// Field that carries the venue for each entry type. Standard styles ignore
// `journal` outside @article, so the venue has to go where the type expects it.
// NOTE(review): for theses this assumes the venue column holds the
// degree-granting institution — confirm against the data.
const venue_field_of = entry_type => ({
  "inproceedings": "booktitle",
  "phdthesis": "school",
  "misc": "howpublished"
})[entry_type] ?? "journal"
const authors_of = p => (p.authors ?? "").split(",").map(s => s.trim()).filter(Boolean)
// Citation key for one publication; falls back to "ref" if every part slugs
// to nothing (e.g. non-ASCII author, no year, no usable title word).
const bibtex_key = p => {
  const first_last = authors_of(p)[0]?.split(/\s+/).pop() ?? "anon"
  const title_word = (p.title ?? "ref").split(/\s+/).find(w => w.length > 3) ?? "ref"
  return `${slug(first_last)}${p.year ?? ""}${slug(title_word)}` || "ref"
}
// Render one entry under an explicit key.
const format_entry = (p, key) => {
  const author_list = authors_of(p)
  const entry_type = entry_type_of(p.type)
  const fields = []
  if (p.title) fields.push(` title = {${escape_tex(p.title)}}`)
  if (author_list.length) fields.push(` author = {${author_list.map(escape_tex).join(" and ")}}`)
  if (p.year) fields.push(` year = {${p.year}}`)
  if (p.journal) fields.push(` ${venue_field_of(entry_type)} = {${escape_tex(p.journal)}}`)
  if (p.doi) fields.push(` doi = {${escape(p.doi)}}`)
  if (p.url) fields.push(` url = {${escape(p.url)}}`)
  if (p.type === "preprint") fields.push(` note = {preprint}`)
  return `@${entry_type}{${key},\n${fields.join(",\n")}\n}`
}
// Format one publication as a BibTeX entry (key derived from the publication).
const to_bibtex = p => format_entry(p, bibtex_key(p))
// Format a list of publications as one .bib file body. Repeated keys (same
// first author, year and first long title word — e.g. a preprint and its
// conference version) get "-2", "-3", … appended, because BibTeX rejects
// duplicate keys. Slugs never contain "-", so a suffixed key cannot collide
// with another publication's base key.
const to_bibtex_all = pubs => {
  const times_seen = new Map()
  const entries = pubs.map(p => {
    const base = bibtex_key(p)
    const n = (times_seen.get(base) ?? 0) + 1
    times_seen.set(base, n)
    return format_entry(p, n === 1 ? base : `${base}-${n}`)
  })
  return entries.join("\n\n") + "\n"
}
// Build the .bib blob from the ticked rows and trigger a browser download
// through a temporary <a download> element.
const handle_download = () => {
  if (!selected_pubs.length) return
  const blob = new Blob([to_bibtex_all(selected_pubs)], { type: "application/x-bibtex;charset=utf-8" })
  const href = URL.createObjectURL(blob)
  const a = document.createElement("a")
  a.href = href
  a.download = `de-novo-papers-${new Date().toISOString().slice(0,10)}.bib`
  document.body.appendChild(a); a.click(); a.remove()
  URL.revokeObjectURL(href)
}
const disabled = selected_pubs.length === 0
const btn = html`<button class="download-btn" ?disabled=${disabled}>
⬇ Download ${selected_pubs.length || "—"} selected as BibTeX
</button>`
btn.disabled = disabled
btn.onclick = handle_download
return html`<div style="margin: 0.5rem 0 1rem; display: flex; gap: 0.75rem; align-items: center;">
${btn}
<span style="color:#57606a; font-size: 0.9em;">Tick rows in the table above to enable.</span>
</div>`
}Browse all authors
Aggregated from the currently-filtered set of papers. Searching is case-insensitive across every column (name, affiliation, country, methods).
authors_table_rows = {
// Roll up authors across the filtered pubs, tagging each with the
// model names and kinds they touched in that subset.
const detail = new Map(author_details_t.map(d => [d.name, d]))
const counts = new Map() // name → { papers, models:Set, kinds:Set }
for (const p of pubs_filtered) {
const ms = (p.models ?? "").split(", ").filter(Boolean)
for (const a of (p.authors ?? "").split(", ").filter(Boolean)) {
const slot = counts.get(a) ?? { papers: 0, models: new Set(), kinds: new Set() }
slot.papers += 1
for (const m of ms) slot.models.add(m)
if (p.kind) slot.kinds.add(p.kind)
counts.set(a, slot)
}
}
return Array.from(counts, ([name, v]) => ({
name,
papers: v.papers,
methods: Array.from(v.models).sort().join(", "),
kinds: Array.from(v.kinds).sort().join(", "),
affiliations: detail.get(name)?.affiliations ?? "",
countries: detail.get(name)?.countries ?? ""
})).sort((a, b) => b.papers - a.papers || a.name.localeCompare(b.name))
}{
const table = Inputs.table(author_search, {
columns: ["name", "papers", "methods", "kinds", "affiliations", "countries"],
header: {
name: "Author", papers: "Papers", methods: "Method(s)",
kinds: "Kind(s)", affiliations: "Affiliation(s)", countries: "Country / countries"
},
format: {
papers: n => n == null ? "" : String(n),
affiliations: s => !s ? "" : (s.length > 80 ? s.slice(0, 80) + "…" : s),
methods: s => !s ? "" : (s.length > 60 ? s.slice(0, 60) + "…" : s)
},
sort: "papers",
reverse: true,
rows: 25,
width: { name: 180, papers: 70, methods: 220, kinds: 130, affiliations: 320, countries: 120 }
})
return html`<div class="chart-wrap">
<button class="fs-btn" onclick="
const el = this.parentElement;
if (document.fullscreenElement) document.exitFullscreen();
else el.requestFullscreen();
">⛶ Fullscreen</button>
<div class="chart-scroll">${table}</div>
</div>`
}Contributing
Easiest path: open a GitHub issue with a link to the paper (DOI / arXiv / bioRxiv / OpenReview / …) and I’ll wire it into the database. Corrections are equally welcome — wrong author lists, missing affiliations, mis-classified kind / DL / acquisition, broken hyperlinks, anything that looks off.
Advanced: edit the database directly
The site is generated from denovo.db (SQLite, the source of truth). If you’re comfortable with SQL:
Edit
denovo.db with any SQLite tool (sqlite3 CLI, DB Browser for SQLite, DataGrip, …). A new paper typically needs rows in publication, publication_author, and publication_algorithm; a new model also needs a row in algorithm (set kind, is_deep_learning, acquisition_mode). Affiliations cascade through country → city → affiliation and link to authors via author_affiliation.
Regenerate the human-readable SQL dump so the diff is reviewable:
Open a PR with both
denovo.db and denovo.sql. The GitHub Action rebuilds the site and publishes to gh-pages on merge — typically live within ~3 minutes.
This page is a comprehensive map of de novo peptide sequencing — algorithms, post-processors, downstream applications, and adjacent tools, deep-learning and classical alike. Source data and code: GitHub — rebuilt automatically on every push to main.