Skip to content

Instantly share code, notes, and snippets.

@dchud
Last active August 29, 2015 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dchud/4333a23cda0a42e38e19 to your computer and use it in GitHub Desktop.
Save dchud/4333a23cda0a42e38e19 to your computer and use it in GitHub Desktop.
animated Anscombe's Quartet regression diagnostics

What makes Anscombe's Quartet of datasets useful, as Wikipedia explains, is their near-equivalent summary statistics: the x and y sets share the same mean, sample variance, correlation, and simple linear regression model. It's instructive as a clear example of what to watch out for when developing simple linear regressions, and the issues each dataset highlights become clear in the different diagnostic plots.

The animation allows visual tracking of each data point through the different diagnostic views, drawing out the key differences in each dataset.

For a more complete writeup see http://data.onebiglibrary.net/2014/11/12/animated-anscombe-quartet-regression-diagnostics/.

{
"a1": {
"y": [
8.04,
6.95,
7.58,
8.81,
8.33,
9.96,
7.24,
4.26,
10.84,
4.82,
5.68
],
"x": [
10,
8,
13,
9,
11,
14,
6,
4,
12,
7,
5
],
"cooks": [
6.13978807921968e-05,
0.000104246723218346,
0.489209275774335,
0.0616369989508526,
0.00159934187639652,
0.000382899511122252,
0.126756484751495,
0.122699896340297,
0.27902959337588,
0.154341222372027,
0.00426801142667734
],
"quantile": [
0.0332439747060959,
-0.0433179064352171,
-1.63771133202482,
1.11588166845639,
-0.145839534724648,
-0.0352587610519202,
1.05644547125458,
-0.631170568720647,
1.56742628526857,
-1.43266807544596,
0.152968778717564
],
"error": [
0.038999999999999,
-0.0508181818181821,
-1.92127272727273,
1.30909090909091,
-0.171090909090909,
-0.0413636363636357,
1.23936363636364,
-0.740454545454545,
1.83881818181818,
-1.68072727272727,
0.179454545454545
]
},
"a3": {
"y": [
7.46,
6.77,
12.74,
7.11,
7.81,
8.84,
6.08,
5.39,
8.15,
6.42,
5.73
],
"x": [
10,
8,
13,
9,
11,
14,
6,
4,
12,
7,
5
],
"cooks": [
0.0117646225857928,
0.00214148127405174,
1.39284945025107,
0.00547313537024746,
0.0259838693465045,
0.300570810724506,
0.000517641076720796,
0.033817333563048,
0.0595359332877321,
0.000354629303376175,
0.00694780839315189
],
"quantile": [
-0.460177364224132,
-0.196333040858973,
2.76338948784212,
-0.332518257120012,
-0.587836471328246,
-0.987866010954444,
0.0675112825061848,
0.331355605871341,
-0.724021687589286,
-0.0686739337548555,
0.195170389610302
],
"error": [
-0.539727272727276,
-0.230272727272728,
3.24109090909091,
-0.389999999999999,
-0.689454545454545,
-1.15863636363636,
0.0791818181818186,
0.388636363636364,
-0.849181818181817,
-0.0805454545454541,
0.228909090909091
]
},
"a2": {
"y": [
9.14,
8.14,
8.74,
8.77,
9.26,
8.1,
6.13,
3.1,
9.13,
7.26,
4.74
],
"x": [
10,
8,
13,
9,
11,
14,
6,
4,
12,
7,
5
],
"cooks": [
0.0523253282572325,
0.0523253282572324,
0.0766572464822465,
0.0578706499704367,
0.0314518390187984,
0.807869303784104,
0.00137383643085486,
0.807869303784103,
0.00137383643085488,
0.0314518390187984,
0.0766572464822465
],
"quantile": [
0.970492611321788,
0.970492611321788,
-0.648285966222136,
1.08125114557479,
0.646736895813002,
-1.61955311274849,
0.109983999048439,
-1.61955311274849,
0.109983999048439,
0.646736895813003,
-0.648285966222136
],
"error": [
1.13909090909091,
1.13909090909091,
-0.760909090909091,
1.26909090909091,
0.759090909090909,
-1.90090909090909,
0.129090909090909,
-1.90090909090909,
0.12909090909091,
0.759090909090909,
-0.760909090909091
]
},
"a4": {
"y": [
6.58,
5.76,
7.71,
8.84,
8.47,
7.04,
5.25,
12.5,
5.56,
7.91,
6.89
],
"x": [
8,
8,
8,
8,
8,
8,
8,
19,
8,
8,
8
],
"cooks": [
0.00716516600865082,
0.0622594999563802,
0.0203214426368309,
0.13671794558337,
0.087237991239013,
6.14881291527244e-05,
0.123946525621549,
"NaN",
0.0839440709475179,
0.0334033352034457,
0.000498090229645427
],
"quantile": [
-0.35912809435592,
-1.05861749428906,
0.604802420186094,
1.5687329347281,
1.25310966890462,
0.0332683982895034,
-1.49366577961333,
1.07620459712723e-18,
-1.22922466500446,
0.775409590901494,
-0.094686979747047
],
"error": [
-0.421000000000003,
-1.241,
0.709,
1.839,
1.469,
0.0390000000000005,
-1.751,
1.11022302462516e-16,
-1.441,
0.909000000000001,
-0.111
]
}
}
<html>
<head>
<title>animating Anscombe's Quartet regression diagnostics</title>
<style>
.axis path,
.axis line {
fill: none;
stroke: black;
shape-rendering: crispEdges;
}
.axis text {
font-family: sans-serif;
font-size: 11px;
}
.label {
font-family: sans-serif;
font-variant: small-caps;
font-weight: normal;
font-size: x-large;
}
</style>
<link href="//maxcdn.bootstrapcdn.com/bootstrap/3.2.0/css/bootstrap.css" rel="stylesheet">
<script src="//maxcdn.bootstrapcdn.com/bootstrap/3.2.0/js/bootstrap.min.js"></script>
<script src="https://d3js.org/d3.v3.min.js" charset="utf-8"></script>
</head>
<body>
<div class="container-fluid">
<div class="row">
<div id="a1" class="col-xs-6"></div>
<div id="a2" class="col-xs-6"></div>
</div>
<div class="row">
<div id="a3" class="col-xs-6"></div>
<div id="a4" class="col-xs-6"></div>
</div>
</div>
<script>
// Each set in Anscombe's Quartet has the same summary numbers.
// Better to use a javascript stats lib to calculate all this
// inside the chart; oh well, a shortcut for now.
var xmean = 9;
var ymean = 7.5;
// NOTE(review): despite the "sd" suffix these look like the shared sample
// variances rather than standard deviations -- confirm before relying on them.
var xsd = 11;
var ysd = 4.1245; // fudging slightly three digits down
// coefficients of the shared regression line evaluated by expected()
var slope = 0.5;
var intercept = 3.0;
// normal quantiles for the Q-Q view, in ascending order; one entry per
// observation (each set has 11 observations)
var qnorm = [-1.383, -0.967, -0.674, -0.431, -0.210, 0.0, 0.210, 0.431,
    0.674, 0.967, 1.383];
// colorbrewer "spectral" 11
var colors = ["#9e0142", "#d53e4f", "#f46d43", "#fdae61", "#fee08b",
    "#ffffbf", "#e6f598", "#abdda4", "#66c2a5", "#3288bd", "#5e4fa2"];
// Maps an observation index to its color. An ordinal scale pairs domain and
// range entries by position, so the domain has to list every index
// (0, 1, ..., 10). A two-entry domain of [0, 10] would pair index 10 with the
// second color and push every other index along by one.
var color_scale = d3.scale.ordinal()
    .domain(d3.range(colors.length))
    .range(colors);
/**
 * Evaluate the shared regression line.
 *
 * Reads the file-level `slope` and `intercept`.
 *
 * @param {number} index - position along the x axis at which to evaluate the line
 * @returns {number} slope * index + intercept
 */
function expected(index) {
    var predicted = intercept + slope * index;
    return predicted;
}
/**
 * Build a self-animating regression-diagnostics chart.
 *
 * The returned function is meant to be passed to selection.call() on a
 * selection whose bound datum holds the parallel arrays `x`, `y`, `error`,
 * `cooks` and `quantile`. It appends one <svg> to the selection and then
 * cycles, via chained timeouts, through four views of the same points:
 * model fit -> residuals -> Cook's distance -> Q-Q -> model fit -> ...
 *
 * Relies on the file-level expected(), qnorm and color_scale.
 *
 * @returns {function} chart function taking a d3 selection
 */
function regcycle() {
    var width = 400;
    var height = 400;
    var padding = 30;     // margin kept free around the plot area for the axes
    var buffer = 1.1;     // widens the Q-Q domains so points clear the edges
    var duration = 2000;  // ms per transition
    var delay = 2000;     // ms each view is held before the next transition
    var nan_cooks = 50;   // plotted in place of a NaN Cook's distance; far
                          // beyond the axis maximum, so the point leaves the top

    // linear scale spanning the horizontal plot area
    function x_scale(domain) {
        return d3.scale.linear()
            .domain(domain)
            .range([padding, width - padding]);
    }

    // linear scale spanning the vertical plot area (larger values are higher)
    function y_scale(domain) {
        return d3.scale.linear()
            .domain(domain)
            .range([height - padding, padding]);
    }

    function bottom_axis(scale) {
        return d3.svg.axis()
            .orient("bottom")
            .scale(scale);
    }

    function left_axis(scale) {
        return d3.svg.axis()
            .orient("left")
            .scale(scale);
    }

    function my(sel) {
        var seldata = sel.datum();
        // reshape the parallel arrays into one object per observation
        var data = seldata.x.map(function(x_value, i) {
            return {
                x: x_value,
                y: seldata.y[i],
                residual: seldata.error[i],
                // the data file carries an undefined Cook's distance as the
                // string "NaN"; coerce so every later check sees a number
                cooks: Number(seldata.cooks[i]),
                quantile: seldata.quantile[i]
            };
        });
        // suffix for element ids so each chart can find its own pieces
        var uid = Math.round(Math.random() * 1024);
        var min_x = d3.min(data, function(d) { return d.x; }) - 1;
        var max_x = d3.max(data, function(d) { return d.x; }) + 1;
        var min_y = d3.min(data, function(d) { return d.y; }) - 1;
        var max_y = d3.max(data, function(d) { return d.y; }) + 1;
        var max_residual = d3.max(data,
            function(d) { return Math.abs(d.residual); });
        var max_cooks = d3.max(data, function(d) { return d.cooks; });
        var max_quantile = d3.max(data,
            function(d) { return Math.abs(d.quantile); });
        var max_qnorm = d3.max(qnorm);
        // When the largest Cook's distance is at least 0.5, stretch the axis
        // to 1.1 so the reference line drawn at 1 fits on it; smaller maxima
        // keep a tight axis so individual values stay discernible.
        if (max_cooks >= 0.5 && max_cooks <= 1.1) {
            max_cooks = 1.1;
        }
        // check for NaN values, set a high axis maximum if present
        if (data.some(function(d) { return isNaN(d.cooks); })) {
            max_cooks = 2;
        }
        // qq_rank[i] is the position of observation i when all observations
        // are ordered by ascending quantile; it selects the matching entry
        // of qnorm, so each point keeps its own quantile in the Q-Q view
        var qq_rank = [];
        data.map(function(d, i) { return i; })
            .sort(function(a, b) {
                return data[a].quantile - data[b].quantile;
            })
            .forEach(function(obs_index, position) {
                qq_rank[obs_index] = position;
            });

        var svg = sel.append("svg")
            .attr("width", width)
            .attr("height", height);
        // current scales; every view replaces the ones it needs
        var x = x_scale([min_x, max_x]);
        var y = y_scale([min_y, max_y]);

        // the regression line; the later views reuse it as a reference line
        svg.append("line")
            .attr("id", "line" + uid)
            .attr("x1", x(min_x))
            .attr("y1", y(expected(min_x)))
            .attr("x2", x(max_x))
            .attr("y2", y(expected(max_x)))
            .attr("stroke-width", 2)
            .attr("stroke", "steelblue");
        // One group per observation. These are created before the axis
        // groups so that selectAll("g") starts out empty and every datum
        // lands in the enter selection.
        // styling elements should be in css, not here
        svg.selectAll("g")
            .data(data)
            .enter().append("g")
            .attr("class", "observation")
            .each(function(d, i) {
                var o = d3.select(this);
                // bar from the point to the fitted value at the same x;
                // hidden (zero width) until the residual view
                o.append("line")
                    .attr("x1", x(d.x))
                    .attr("y1", y(d.y))
                    .attr("x2", x(d.x))
                    .attr("y2", y(expected(d.x)))
                    .attr("class", "residual-bar")
                    .attr("stroke-width", 0)
                    .attr("stroke", "gray");
                o.append("circle")
                    .attr("r", 5) // hard-coded!
                    .attr("cx", x(d.x))
                    .attr("cy", y(d.y))
                    .attr("class", "data-point")
                    .attr("stroke", "black")
                    .attr("fill", color_scale(i));
            });
        // establish initial axes
        svg.append("g")
            .attr("id", "x_axis" + uid)
            .attr("class", "axis")
            .attr("transform", "translate(0, " + (height - padding) + ")")
            .call(bottom_axis(x));
        svg.append("g")
            .attr("id", "y_axis" + uid)
            .attr("class", "axis")
            .attr("transform", "translate(" + padding + ", 0)")
            .call(left_axis(y));
        // initial label
        svg.append("text")
            .attr("id", "label" + uid)
            .attr("class", "label")
            .attr("x", 40) // hard-coded!
            .attr("y", 40) // hard-coded!
            .text("model fit");
        setTimeout(residual, delay);

        // replace the x scale and animate the x axis to it
        function rescale_x(domain) {
            x = x_scale(domain);
            svg.select("#x_axis" + uid).transition()
                .duration(duration)
                .call(bottom_axis(x));
        }

        // replace the y scale and animate the y axis to it
        function rescale_y(domain) {
            y = y_scale(domain);
            svg.select("#y_axis" + uid).transition()
                .duration(duration)
                .call(left_axis(y));
        }

        // change the view title
        function relabel(text) {
            svg.select("#label" + uid).transition()
                .duration(duration)
                .text(text);
        }

        // start a transition on the regression/reference line
        function line_transition() {
            return svg.select("#line" + uid).transition()
                .duration(duration);
        }

        // call update(group_selection, datum, index) for every observation
        function each_observation(update) {
            svg.selectAll(".observation").each(function(d, i) {
                update(d3.select(this), d, i);
            });
        }

        // view 1: the data with the fitted regression line
        function fit() {
            rescale_x([min_x, max_x]);
            rescale_y([min_y, max_y]);
            relabel("model fit");
            line_transition()
                .attr("x1", x(min_x))
                .attr("y1", y(expected(min_x)))
                .attr("x2", x(max_x))
                .attr("y2", y(expected(max_x)));
            each_observation(function(o, d) {
                o.select(".residual-bar").transition()
                    .duration(duration)
                    .attr("x1", x(d.x))
                    .attr("y1", y(d.y))
                    .attr("x2", x(d.x))
                    .attr("y2", y(expected(d.x)))
                    .attr("stroke-width", 0);
                o.select(".data-point").transition()
                    .duration(duration)
                    .attr("cx", x(d.x))
                    .attr("cy", y(d.y));
            });
            setTimeout(residual, delay + duration);
        }

        // view 2: residuals against x; the line flattens to zero
        function residual() {
            // only y changes; x keeps the scale of the fit view
            rescale_y([-max_residual, max_residual]);
            relabel("residuals");
            line_transition()
                .attr("y1", y(0))
                .attr("y2", y(0));
            each_observation(function(o, d) {
                o.select(".data-point").transition()
                    .duration(duration)
                    .attr("cy", y(d.residual));
                // delayed so the bars show up once the points have arrived
                o.select(".residual-bar").transition()
                    .delay(duration)
                    .attr("x1", x(d.x))
                    .attr("y1", y(d.residual))
                    .attr("x2", x(d.x))
                    .attr("y2", y(0))
                    .attr("stroke-width", 3); // style hard-coded
            });
            setTimeout(cooks, delay + duration);
        }

        // view 3: Cook's distance per observation, x in order, not by value
        function cooks() {
            rescale_x([0, data.length]);
            rescale_y([0, max_cooks]);
            relabel("cook's distance");
            // reference line at a distance of 1
            line_transition()
                .attr("x1", x(0))
                .attr("y1", y(1))
                .attr("x2", x(data.length))
                .attr("y2", y(1));
            each_observation(function(o, d, i) {
                var px = x(i + 1);
                var py = y(isNaN(d.cooks) ? nan_cooks : d.cooks);
                o.select(".data-point").transition()
                    .duration(duration)
                    .attr("cx", px)
                    .attr("cy", py);
                o.select(".residual-bar").transition()
                    .duration(duration)
                    .attr("x1", px)
                    .attr("y1", py)
                    .attr("x2", px)
                    .attr("y2", y(0));
            });
            setTimeout(qq, delay + duration);
        }

        // view 4: normal quantiles (x) against observed quantiles (y)
        function qq() {
            rescale_x([-max_qnorm * buffer, max_qnorm * buffer]);
            rescale_y([-max_quantile * buffer, max_quantile * buffer]);
            relabel("q-q normal vs. observed");
            line_transition()
                .attr("y1", y(-max_quantile))
                .attr("y2", y(max_quantile));
            each_observation(function(o, d, i) {
                // every point keeps its own quantile and is paired with the
                // normal quantile of the same rank
                o.select(".data-point").transition()
                    .duration(duration)
                    .attr("cx", x(qnorm[qq_rank[i]]))
                    .attr("cy", y(d.quantile));
                o.select(".residual-bar").transition()
                    .attr("stroke-width", 0);
            });
            setTimeout(fit, delay + duration);
        }
    }
    // add accessors here some other time :)
    return my;
}
// init the four charts
var a1_cycle = regcycle();
var a2_cycle = regcycle();
var a3_cycle = regcycle();
var a4_cycle = regcycle();
// grab data, bind to charts, and render
// The two-argument callback form receives (error, data), so a failed
// request can be reported instead of surfacing as a TypeError on null data.
d3.json("anscombe.json", function(error, data) {
    if (error || !data) {
        console.error("could not load anscombe.json", error);
        return;
    }
    // each container id doubles as the key of its dataset in the JSON
    [
        ["a1", a1_cycle],
        ["a2", a2_cycle],
        ["a3", a3_cycle],
        ["a4", a4_cycle]
    ].forEach(function(pair) {
        d3.select("#" + pair[0])
            .datum(data[pair[0]])
            .call(pair[1]);
    });
});
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment