{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# First steps with DataFrames.jl\n", "\n", "### Bogumił Kamiński\n", "\n", "In this notebook we will reproduce the classical Anscombe's quartet plot.\n", "\n", "Our objective is to produce a figure similar to this one (the plot is taken from https://upload.wikimedia.org/wikipedia/commons/e/ec/Anscombe%27s_quartet_3.svg):" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![Anscombe's quartet](https://upload.wikimedia.org/wikipedia/commons/e/ec/Anscombe%27s_quartet_3.svg)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start with loading of the required packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "using DataFrames" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "using Statistics" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "using PyPlot" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "using GLM" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is a matrix in which we store 8 columns representing Anscombe's quartet data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11×8 Array{Float64,2}:\n", " 10.0 8.04 10.0 9.14 10.0 7.46 8.0 6.58\n", " 8.0 6.95 8.0 8.14 8.0 6.77 8.0 5.76\n", " 13.0 7.58 13.0 8.74 13.0 12.74 8.0 7.71\n", " 9.0 8.81 9.0 8.77 9.0 7.11 8.0 8.84\n", " 11.0 8.33 11.0 9.26 11.0 7.81 8.0 8.47\n", " 14.0 9.96 14.0 8.1 14.0 8.84 8.0 7.04\n", " 6.0 7.24 6.0 6.13 6.0 6.08 8.0 5.25\n", " 4.0 4.26 4.0 3.1 4.0 5.39 19.0 12.5\n", " 12.0 10.84 12.0 9.13 12.0 8.15 8.0 5.56\n", " 7.0 4.82 7.0 7.26 7.0 6.42 8.0 7.91\n", " 5.0 5.68 5.0 4.74 5.0 5.73 8.0 6.89" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "aq = [10.0 8.04 10.0 9.14 10.0 7.46 8.0 6.58\n", 
" 8.0 6.95 8.0 8.14 8.0 6.77 8.0 5.76\n", " 13.0 7.58 13.0 8.74 13.0 12.74 8.0 7.71\n", " 9.0 8.81 9.0 8.77 9.0 7.11 8.0 8.84\n", " 11.0 8.33 11.0 9.26 11.0 7.81 8.0 8.47\n", " 14.0 9.96 14.0 8.1 14.0 8.84 8.0 7.04\n", " 6.0 7.24 6.0 6.13 6.0 6.08 8.0 5.25\n", " 4.0 4.26 4.0 3.1 4.0 5.39 19.0 12.50 \n", " 12.0 10.84 12.0 9.13 12.0 8.15 8.0 5.56\n", " 7.0 4.82 7.0 7.26 7.0 6.42 8.0 7.91\n", " 5.0 5.68 5.0 4.74 5.0 5.73 8.0 6.89]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can simply convert a matrix to a `DataFrame` by calling its constructor" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

11 rows × 8 columns

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
110.08.0410.09.1410.07.468.06.58
28.06.958.08.148.06.778.05.76
313.07.5813.08.7413.012.748.07.71
49.08.819.08.779.07.118.08.84
511.08.3311.09.2611.07.818.08.47
614.09.9614.08.114.08.848.07.04
76.07.246.06.136.06.088.05.25
84.04.264.03.14.05.3919.012.5
912.010.8412.09.1312.08.158.05.56
107.04.827.07.267.06.428.07.91
115.05.685.04.745.05.738.06.89
" ], "text/latex": [ "\\begin{tabular}{r|cccccccc}\n", "\t& x1 & x2 & x3 & x4 & x5 & x6 & x7 & x8\\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64\\\\\n", "\t\\hline\n", "\t1 & 10.0 & 8.04 & 10.0 & 9.14 & 10.0 & 7.46 & 8.0 & 6.58 \\\\\n", "\t2 & 8.0 & 6.95 & 8.0 & 8.14 & 8.0 & 6.77 & 8.0 & 5.76 \\\\\n", "\t3 & 13.0 & 7.58 & 13.0 & 8.74 & 13.0 & 12.74 & 8.0 & 7.71 \\\\\n", "\t4 & 9.0 & 8.81 & 9.0 & 8.77 & 9.0 & 7.11 & 8.0 & 8.84 \\\\\n", "\t5 & 11.0 & 8.33 & 11.0 & 9.26 & 11.0 & 7.81 & 8.0 & 8.47 \\\\\n", "\t6 & 14.0 & 9.96 & 14.0 & 8.1 & 14.0 & 8.84 & 8.0 & 7.04 \\\\\n", "\t7 & 6.0 & 7.24 & 6.0 & 6.13 & 6.0 & 6.08 & 8.0 & 5.25 \\\\\n", "\t8 & 4.0 & 4.26 & 4.0 & 3.1 & 4.0 & 5.39 & 19.0 & 12.5 \\\\\n", "\t9 & 12.0 & 10.84 & 12.0 & 9.13 & 12.0 & 8.15 & 8.0 & 5.56 \\\\\n", "\t10 & 7.0 & 4.82 & 7.0 & 7.26 & 7.0 & 6.42 & 8.0 & 7.91 \\\\\n", "\t11 & 5.0 & 5.68 & 5.0 & 4.74 & 5.0 & 5.73 & 8.0 & 6.89 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "11×8 DataFrame. 
Omitted printing of 1 columns\n", "│ Row │ x1 │ x2 │ x3 │ x4 │ x5 │ x6 │ x7 │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n", "│ 1 │ 10.0 │ 8.04 │ 10.0 │ 9.14 │ 10.0 │ 7.46 │ 8.0 │\n", "│ 2 │ 8.0 │ 6.95 │ 8.0 │ 8.14 │ 8.0 │ 6.77 │ 8.0 │\n", "│ 3 │ 13.0 │ 7.58 │ 13.0 │ 8.74 │ 13.0 │ 12.74 │ 8.0 │\n", "│ 4 │ 9.0 │ 8.81 │ 9.0 │ 8.77 │ 9.0 │ 7.11 │ 8.0 │\n", "│ 5 │ 11.0 │ 8.33 │ 11.0 │ 9.26 │ 11.0 │ 7.81 │ 8.0 │\n", "│ 6 │ 14.0 │ 9.96 │ 14.0 │ 8.1 │ 14.0 │ 8.84 │ 8.0 │\n", "│ 7 │ 6.0 │ 7.24 │ 6.0 │ 6.13 │ 6.0 │ 6.08 │ 8.0 │\n", "│ 8 │ 4.0 │ 4.26 │ 4.0 │ 3.1 │ 4.0 │ 5.39 │ 19.0 │\n", "│ 9 │ 12.0 │ 10.84 │ 12.0 │ 9.13 │ 12.0 │ 8.15 │ 8.0 │\n", "│ 10 │ 7.0 │ 4.82 │ 7.0 │ 7.26 │ 7.0 │ 6.42 │ 8.0 │\n", "│ 11 │ 5.0 │ 5.68 │ 5.0 │ 4.74 │ 5.0 │ 5.73 │ 8.0 │" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = DataFrame(aq)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that the auto-generated column names are `x1`, `x2`, etc." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next we replace automatically generated column names by proper ones." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8-element Array{String,1}:\n", " \"x1\"\n", " \"y1\"\n", " \"x2\"\n", " \"y2\"\n", " \"x3\"\n", " \"y3\"\n", " \"x4\"\n", " \"y4\"" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "newnames = vec(string.([\"x\", \"y\"], [1 2 3 4]))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

11 rows × 8 columns

x1y1x2y2x3y3x4y4
Float64Float64Float64Float64Float64Float64Float64Float64
110.08.0410.09.1410.07.468.06.58
28.06.958.08.148.06.778.05.76
313.07.5813.08.7413.012.748.07.71
49.08.819.08.779.07.118.08.84
511.08.3311.09.2611.07.818.08.47
614.09.9614.08.114.08.848.07.04
76.07.246.06.136.06.088.05.25
84.04.264.03.14.05.3919.012.5
912.010.8412.09.1312.08.158.05.56
107.04.827.07.267.06.428.07.91
115.05.685.04.745.05.738.06.89
" ], "text/latex": [ "\\begin{tabular}{r|cccccccc}\n", "\t& x1 & y1 & x2 & y2 & x3 & y3 & x4 & y4\\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64\\\\\n", "\t\\hline\n", "\t1 & 10.0 & 8.04 & 10.0 & 9.14 & 10.0 & 7.46 & 8.0 & 6.58 \\\\\n", "\t2 & 8.0 & 6.95 & 8.0 & 8.14 & 8.0 & 6.77 & 8.0 & 5.76 \\\\\n", "\t3 & 13.0 & 7.58 & 13.0 & 8.74 & 13.0 & 12.74 & 8.0 & 7.71 \\\\\n", "\t4 & 9.0 & 8.81 & 9.0 & 8.77 & 9.0 & 7.11 & 8.0 & 8.84 \\\\\n", "\t5 & 11.0 & 8.33 & 11.0 & 9.26 & 11.0 & 7.81 & 8.0 & 8.47 \\\\\n", "\t6 & 14.0 & 9.96 & 14.0 & 8.1 & 14.0 & 8.84 & 8.0 & 7.04 \\\\\n", "\t7 & 6.0 & 7.24 & 6.0 & 6.13 & 6.0 & 6.08 & 8.0 & 5.25 \\\\\n", "\t8 & 4.0 & 4.26 & 4.0 & 3.1 & 4.0 & 5.39 & 19.0 & 12.5 \\\\\n", "\t9 & 12.0 & 10.84 & 12.0 & 9.13 & 12.0 & 8.15 & 8.0 & 5.56 \\\\\n", "\t10 & 7.0 & 4.82 & 7.0 & 7.26 & 7.0 & 6.42 & 8.0 & 7.91 \\\\\n", "\t11 & 5.0 & 5.68 & 5.0 & 4.74 & 5.0 & 5.73 & 8.0 & 6.89 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "11×8 DataFrame. 
Omitted printing of 1 columns\n", "│ Row │ x1 │ y1 │ x2 │ y2 │ x3 │ y3 │ x4 │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n", "│ 1 │ 10.0 │ 8.04 │ 10.0 │ 9.14 │ 10.0 │ 7.46 │ 8.0 │\n", "│ 2 │ 8.0 │ 6.95 │ 8.0 │ 8.14 │ 8.0 │ 6.77 │ 8.0 │\n", "│ 3 │ 13.0 │ 7.58 │ 13.0 │ 8.74 │ 13.0 │ 12.74 │ 8.0 │\n", "│ 4 │ 9.0 │ 8.81 │ 9.0 │ 8.77 │ 9.0 │ 7.11 │ 8.0 │\n", "│ 5 │ 11.0 │ 8.33 │ 11.0 │ 9.26 │ 11.0 │ 7.81 │ 8.0 │\n", "│ 6 │ 14.0 │ 9.96 │ 14.0 │ 8.1 │ 14.0 │ 8.84 │ 8.0 │\n", "│ 7 │ 6.0 │ 7.24 │ 6.0 │ 6.13 │ 6.0 │ 6.08 │ 8.0 │\n", "│ 8 │ 4.0 │ 4.26 │ 4.0 │ 3.1 │ 4.0 │ 5.39 │ 19.0 │\n", "│ 9 │ 12.0 │ 10.84 │ 12.0 │ 9.13 │ 12.0 │ 8.15 │ 8.0 │\n", "│ 10 │ 7.0 │ 4.82 │ 7.0 │ 7.26 │ 7.0 │ 6.42 │ 8.0 │\n", "│ 11 │ 5.0 │ 5.68 │ 5.0 │ 4.74 │ 5.0 │ 5.73 │ 8.0 │" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rename!(df, newnames)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We could have also assigned the names to columns at the moment of data frame creation like this:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

11 rows × 8 columns

x1y1x2y2x3y3x4y4
Float64Float64Float64Float64Float64Float64Float64Float64
110.08.0410.09.1410.07.468.06.58
28.06.958.08.148.06.778.05.76
313.07.5813.08.7413.012.748.07.71
49.08.819.08.779.07.118.08.84
511.08.3311.09.2611.07.818.08.47
614.09.9614.08.114.08.848.07.04
76.07.246.06.136.06.088.05.25
84.04.264.03.14.05.3919.012.5
912.010.8412.09.1312.08.158.05.56
107.04.827.07.267.06.428.07.91
115.05.685.04.745.05.738.06.89
" ], "text/latex": [ "\\begin{tabular}{r|cccccccc}\n", "\t& x1 & y1 & x2 & y2 & x3 & y3 & x4 & y4\\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64\\\\\n", "\t\\hline\n", "\t1 & 10.0 & 8.04 & 10.0 & 9.14 & 10.0 & 7.46 & 8.0 & 6.58 \\\\\n", "\t2 & 8.0 & 6.95 & 8.0 & 8.14 & 8.0 & 6.77 & 8.0 & 5.76 \\\\\n", "\t3 & 13.0 & 7.58 & 13.0 & 8.74 & 13.0 & 12.74 & 8.0 & 7.71 \\\\\n", "\t4 & 9.0 & 8.81 & 9.0 & 8.77 & 9.0 & 7.11 & 8.0 & 8.84 \\\\\n", "\t5 & 11.0 & 8.33 & 11.0 & 9.26 & 11.0 & 7.81 & 8.0 & 8.47 \\\\\n", "\t6 & 14.0 & 9.96 & 14.0 & 8.1 & 14.0 & 8.84 & 8.0 & 7.04 \\\\\n", "\t7 & 6.0 & 7.24 & 6.0 & 6.13 & 6.0 & 6.08 & 8.0 & 5.25 \\\\\n", "\t8 & 4.0 & 4.26 & 4.0 & 3.1 & 4.0 & 5.39 & 19.0 & 12.5 \\\\\n", "\t9 & 12.0 & 10.84 & 12.0 & 9.13 & 12.0 & 8.15 & 8.0 & 5.56 \\\\\n", "\t10 & 7.0 & 4.82 & 7.0 & 7.26 & 7.0 & 6.42 & 8.0 & 7.91 \\\\\n", "\t11 & 5.0 & 5.68 & 5.0 & 4.74 & 5.0 & 5.73 & 8.0 & 6.89 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "11×8 DataFrame. 
Omitted printing of 1 columns\n", "│ Row │ x1 │ y1 │ x2 │ y2 │ x3 │ y3 │ x4 │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n", "│ 1 │ 10.0 │ 8.04 │ 10.0 │ 9.14 │ 10.0 │ 7.46 │ 8.0 │\n", "│ 2 │ 8.0 │ 6.95 │ 8.0 │ 8.14 │ 8.0 │ 6.77 │ 8.0 │\n", "│ 3 │ 13.0 │ 7.58 │ 13.0 │ 8.74 │ 13.0 │ 12.74 │ 8.0 │\n", "│ 4 │ 9.0 │ 8.81 │ 9.0 │ 8.77 │ 9.0 │ 7.11 │ 8.0 │\n", "│ 5 │ 11.0 │ 8.33 │ 11.0 │ 9.26 │ 11.0 │ 7.81 │ 8.0 │\n", "│ 6 │ 14.0 │ 9.96 │ 14.0 │ 8.1 │ 14.0 │ 8.84 │ 8.0 │\n", "│ 7 │ 6.0 │ 7.24 │ 6.0 │ 6.13 │ 6.0 │ 6.08 │ 8.0 │\n", "│ 8 │ 4.0 │ 4.26 │ 4.0 │ 3.1 │ 4.0 │ 5.39 │ 19.0 │\n", "│ 9 │ 12.0 │ 10.84 │ 12.0 │ 9.13 │ 12.0 │ 8.15 │ 8.0 │\n", "│ 10 │ 7.0 │ 4.82 │ 7.0 │ 7.26 │ 7.0 │ 6.42 │ 8.0 │\n", "│ 11 │ 5.0 │ 5.68 │ 5.0 │ 4.74 │ 5.0 │ 5.73 │ 8.0 │" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DataFrame(aq, [:x1, :y1, :x2, :y2, :x3, :y3, :x4, :y4])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "> You might have noticed that in the first example we used a string (e.g. `\"x1\"`) as column name and in the second one we used a `Symbol` (e.g. `:x1`). This was intentional. DataFrames.jl allows you to use either of them for column indexing.\n", "\n", "To see the above rule at work let us extract the second column `:y1` from this data frame. 
Here are several options how you can do it:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11-element Array{Float64,1}:\n", " 8.04\n", " 6.95\n", " 7.58\n", " 8.81\n", " 8.33\n", " 9.96\n", " 7.24\n", " 4.26\n", " 10.84\n", " 4.82\n", " 5.68" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.y1" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11-element Array{Float64,1}:\n", " 8.04\n", " 6.95\n", " 7.58\n", " 8.81\n", " 8.33\n", " 9.96\n", " 7.24\n", " 4.26\n", " 10.84\n", " 4.82\n", " 5.68" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.\"y1\"" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11-element Array{Float64,1}:\n", " 8.04\n", " 6.95\n", " 7.58\n", " 8.81\n", " 8.33\n", " 9.96\n", " 7.24\n", " 4.26\n", " 10.84\n", " 4.82\n", " 5.68" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[:, :y1]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11-element Array{Float64,1}:\n", " 8.04\n", " 6.95\n", " 7.58\n", " 8.81\n", " 8.33\n", " 9.96\n", " 7.24\n", " 4.26\n", " 10.84\n", " 4.82\n", " 5.68" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[:, \"y1\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Assume that now we want to reorder columns of the data frame `df` in-place by first grouping the \"x\"-columns and then \"y\" columns.\n", "\n", "This can be easily achieved with the `select!` function.\n", "\n", "Note that in column selection we can in particular use regular expressions like `r\"x\"` (matching all columns that have `\"x\"` in their name) and `:` which matches all columns (in this case only columns not 
having `\"x\"` in their name are left)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

11 rows × 8 columns

x1x2x3x4y1y2y3y4
Float64Float64Float64Float64Float64Float64Float64Float64
110.010.010.08.08.049.147.466.58
28.08.08.08.06.958.146.775.76
313.013.013.08.07.588.7412.747.71
49.09.09.08.08.818.777.118.84
511.011.011.08.08.339.267.818.47
614.014.014.08.09.968.18.847.04
76.06.06.08.07.246.136.085.25
84.04.04.019.04.263.15.3912.5
912.012.012.08.010.849.138.155.56
107.07.07.08.04.827.266.427.91
115.05.05.08.05.684.745.736.89
" ], "text/latex": [ "\\begin{tabular}{r|cccccccc}\n", "\t& x1 & x2 & x3 & x4 & y1 & y2 & y3 & y4\\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64\\\\\n", "\t\\hline\n", "\t1 & 10.0 & 10.0 & 10.0 & 8.0 & 8.04 & 9.14 & 7.46 & 6.58 \\\\\n", "\t2 & 8.0 & 8.0 & 8.0 & 8.0 & 6.95 & 8.14 & 6.77 & 5.76 \\\\\n", "\t3 & 13.0 & 13.0 & 13.0 & 8.0 & 7.58 & 8.74 & 12.74 & 7.71 \\\\\n", "\t4 & 9.0 & 9.0 & 9.0 & 8.0 & 8.81 & 8.77 & 7.11 & 8.84 \\\\\n", "\t5 & 11.0 & 11.0 & 11.0 & 8.0 & 8.33 & 9.26 & 7.81 & 8.47 \\\\\n", "\t6 & 14.0 & 14.0 & 14.0 & 8.0 & 9.96 & 8.1 & 8.84 & 7.04 \\\\\n", "\t7 & 6.0 & 6.0 & 6.0 & 8.0 & 7.24 & 6.13 & 6.08 & 5.25 \\\\\n", "\t8 & 4.0 & 4.0 & 4.0 & 19.0 & 4.26 & 3.1 & 5.39 & 12.5 \\\\\n", "\t9 & 12.0 & 12.0 & 12.0 & 8.0 & 10.84 & 9.13 & 8.15 & 5.56 \\\\\n", "\t10 & 7.0 & 7.0 & 7.0 & 8.0 & 4.82 & 7.26 & 6.42 & 7.91 \\\\\n", "\t11 & 5.0 & 5.0 & 5.0 & 8.0 & 5.68 & 4.74 & 5.73 & 6.89 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "11×8 DataFrame. 
Omitted printing of 1 columns\n", "│ Row │ x1 │ x2 │ x3 │ x4 │ y1 │ y2 │ y3 │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n", "│ 1 │ 10.0 │ 10.0 │ 10.0 │ 8.0 │ 8.04 │ 9.14 │ 7.46 │\n", "│ 2 │ 8.0 │ 8.0 │ 8.0 │ 8.0 │ 6.95 │ 8.14 │ 6.77 │\n", "│ 3 │ 13.0 │ 13.0 │ 13.0 │ 8.0 │ 7.58 │ 8.74 │ 12.74 │\n", "│ 4 │ 9.0 │ 9.0 │ 9.0 │ 8.0 │ 8.81 │ 8.77 │ 7.11 │\n", "│ 5 │ 11.0 │ 11.0 │ 11.0 │ 8.0 │ 8.33 │ 9.26 │ 7.81 │\n", "│ 6 │ 14.0 │ 14.0 │ 14.0 │ 8.0 │ 9.96 │ 8.1 │ 8.84 │\n", "│ 7 │ 6.0 │ 6.0 │ 6.0 │ 8.0 │ 7.24 │ 6.13 │ 6.08 │\n", "│ 8 │ 4.0 │ 4.0 │ 4.0 │ 19.0 │ 4.26 │ 3.1 │ 5.39 │\n", "│ 9 │ 12.0 │ 12.0 │ 12.0 │ 8.0 │ 10.84 │ 9.13 │ 8.15 │\n", "│ 10 │ 7.0 │ 7.0 │ 7.0 │ 8.0 │ 4.82 │ 7.26 │ 6.42 │\n", "│ 11 │ 5.0 │ 5.0 │ 5.0 │ 8.0 │ 5.68 │ 4.74 │ 5.73 │" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select!(df, r\"x\", :)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that we could have used `select` instead of `select!` function to create a new data frame (instead of mutating the data frame in question in-place)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An interesting feature of Anscombe's quartet is that its variables have the same mean and variance.\n", "\n", "We can easily check this using the `describe` function." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

8 rows × 3 columns

variablemeanstd
SymbolFloat64Float64
1x19.03.31662
2x29.03.31662
3x39.03.31662
4x49.03.31662
5y17.500912.03157
6y27.500912.03166
7y37.52.03042
8y47.500912.03058
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& variable & mean & std\\\\\n", "\t\\hline\n", "\t& Symbol & Float64 & Float64\\\\\n", "\t\\hline\n", "\t1 & x1 & 9.0 & 3.31662 \\\\\n", "\t2 & x2 & 9.0 & 3.31662 \\\\\n", "\t3 & x3 & 9.0 & 3.31662 \\\\\n", "\t4 & x4 & 9.0 & 3.31662 \\\\\n", "\t5 & y1 & 7.50091 & 2.03157 \\\\\n", "\t6 & y2 & 7.50091 & 2.03166 \\\\\n", "\t7 & y3 & 7.5 & 2.03042 \\\\\n", "\t8 & y4 & 7.50091 & 2.03058 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "8×3 DataFrame\n", "│ Row │ variable │ mean │ std │\n", "│ │ \u001b[90mSymbol\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼──────────┼─────────┼─────────┤\n", "│ 1 │ x1 │ 9.0 │ 3.31662 │\n", "│ 2 │ x2 │ 9.0 │ 3.31662 │\n", "│ 3 │ x3 │ 9.0 │ 3.31662 │\n", "│ 4 │ x4 │ 9.0 │ 3.31662 │\n", "│ 5 │ y1 │ 7.50091 │ 2.03157 │\n", "│ 6 │ y2 │ 7.50091 │ 2.03166 │\n", "│ 7 │ y3 │ 7.5 │ 2.03042 │\n", "│ 8 │ y4 │ 7.50091 │ 2.03058 │" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "describe(df, :mean => mean, :std => std)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Just to practice the string/`Symbol` duality let us write the same using strings for column names:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

8 rows × 3 columns

variablemeanstd
SymbolFloat64Float64
1x19.03.31662
2x29.03.31662
3x39.03.31662
4x49.03.31662
5y17.500912.03157
6y27.500912.03166
7y37.52.03042
8y47.500912.03058
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& variable & mean & std\\\\\n", "\t\\hline\n", "\t& Symbol & Float64 & Float64\\\\\n", "\t\\hline\n", "\t1 & x1 & 9.0 & 3.31662 \\\\\n", "\t2 & x2 & 9.0 & 3.31662 \\\\\n", "\t3 & x3 & 9.0 & 3.31662 \\\\\n", "\t4 & x4 & 9.0 & 3.31662 \\\\\n", "\t5 & y1 & 7.50091 & 2.03157 \\\\\n", "\t6 & y2 & 7.50091 & 2.03166 \\\\\n", "\t7 & y3 & 7.5 & 2.03042 \\\\\n", "\t8 & y4 & 7.50091 & 2.03058 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "8×3 DataFrame\n", "│ Row │ variable │ mean │ std │\n", "│ │ \u001b[90mSymbol\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼──────────┼─────────┼─────────┤\n", "│ 1 │ x1 │ 9.0 │ 3.31662 │\n", "│ 2 │ x2 │ 9.0 │ 3.31662 │\n", "│ 3 │ x3 │ 9.0 │ 3.31662 │\n", "│ 4 │ x4 │ 9.0 │ 3.31662 │\n", "│ 5 │ y1 │ 7.50091 │ 2.03157 │\n", "│ 6 │ y2 │ 7.50091 │ 2.03166 │\n", "│ 7 │ y3 │ 7.5 │ 2.03042 │\n", "│ 8 │ y4 │ 7.50091 │ 2.03058 │" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "describe(df, \"mean\" => mean, \"std\" => std)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let us add a new column `id` to the data frame that will just index its rows from 1 to the number of rows." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

11 rows × 9 columns

x1x2x3x4y1y2y3y4id
Float64Float64Float64Float64Float64Float64Float64Float64Int64
110.010.010.08.08.049.147.466.581
28.08.08.08.06.958.146.775.762
313.013.013.08.07.588.7412.747.713
49.09.09.08.08.818.777.118.844
511.011.011.08.08.339.267.818.475
614.014.014.08.09.968.18.847.046
76.06.06.08.07.246.136.085.257
84.04.04.019.04.263.15.3912.58
912.012.012.08.010.849.138.155.569
107.07.07.08.04.827.266.427.9110
115.05.05.08.05.684.745.736.8911
" ], "text/latex": [ "\\begin{tabular}{r|ccccccccc}\n", "\t& x1 & x2 & x3 & x4 & y1 & y2 & y3 & y4 & id\\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 10.0 & 10.0 & 10.0 & 8.0 & 8.04 & 9.14 & 7.46 & 6.58 & 1 \\\\\n", "\t2 & 8.0 & 8.0 & 8.0 & 8.0 & 6.95 & 8.14 & 6.77 & 5.76 & 2 \\\\\n", "\t3 & 13.0 & 13.0 & 13.0 & 8.0 & 7.58 & 8.74 & 12.74 & 7.71 & 3 \\\\\n", "\t4 & 9.0 & 9.0 & 9.0 & 8.0 & 8.81 & 8.77 & 7.11 & 8.84 & 4 \\\\\n", "\t5 & 11.0 & 11.0 & 11.0 & 8.0 & 8.33 & 9.26 & 7.81 & 8.47 & 5 \\\\\n", "\t6 & 14.0 & 14.0 & 14.0 & 8.0 & 9.96 & 8.1 & 8.84 & 7.04 & 6 \\\\\n", "\t7 & 6.0 & 6.0 & 6.0 & 8.0 & 7.24 & 6.13 & 6.08 & 5.25 & 7 \\\\\n", "\t8 & 4.0 & 4.0 & 4.0 & 19.0 & 4.26 & 3.1 & 5.39 & 12.5 & 8 \\\\\n", "\t9 & 12.0 & 12.0 & 12.0 & 8.0 & 10.84 & 9.13 & 8.15 & 5.56 & 9 \\\\\n", "\t10 & 7.0 & 7.0 & 7.0 & 8.0 & 4.82 & 7.26 & 6.42 & 7.91 & 10 \\\\\n", "\t11 & 5.0 & 5.0 & 5.0 & 8.0 & 5.68 & 4.74 & 5.73 & 6.89 & 11 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "11×9 DataFrame. 
Omitted printing of 2 columns\n", "│ Row │ x1 │ x2 │ x3 │ x4 │ y1 │ y2 │ y3 │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n", "│ 1 │ 10.0 │ 10.0 │ 10.0 │ 8.0 │ 8.04 │ 9.14 │ 7.46 │\n", "│ 2 │ 8.0 │ 8.0 │ 8.0 │ 8.0 │ 6.95 │ 8.14 │ 6.77 │\n", "│ 3 │ 13.0 │ 13.0 │ 13.0 │ 8.0 │ 7.58 │ 8.74 │ 12.74 │\n", "│ 4 │ 9.0 │ 9.0 │ 9.0 │ 8.0 │ 8.81 │ 8.77 │ 7.11 │\n", "│ 5 │ 11.0 │ 11.0 │ 11.0 │ 8.0 │ 8.33 │ 9.26 │ 7.81 │\n", "│ 6 │ 14.0 │ 14.0 │ 14.0 │ 8.0 │ 9.96 │ 8.1 │ 8.84 │\n", "│ 7 │ 6.0 │ 6.0 │ 6.0 │ 8.0 │ 7.24 │ 6.13 │ 6.08 │\n", "│ 8 │ 4.0 │ 4.0 │ 4.0 │ 19.0 │ 4.26 │ 3.1 │ 5.39 │\n", "│ 9 │ 12.0 │ 12.0 │ 12.0 │ 8.0 │ 10.84 │ 9.13 │ 8.15 │\n", "│ 10 │ 7.0 │ 7.0 │ 7.0 │ 8.0 │ 4.82 │ 7.26 │ 6.42 │\n", "│ 11 │ 5.0 │ 5.0 │ 5.0 │ 8.0 │ 5.68 │ 4.74 │ 5.73 │" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.id = 1:nrow(df) # also writing axes(df, 1) on the right hand side would work\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Similarly to `nrow` which gives us a number of rows in a data frame, one can use `ncol` function to get its number of columns." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ncol(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In order to practice what we have already learnt let us create a new data frame which will have `:id` column in front of the remaining columns." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

11 rows × 9 columns

idx1x2x3x4y1y2y3y4
Int64Float64Float64Float64Float64Float64Float64Float64Float64
1110.010.010.08.08.049.147.466.58
228.08.08.08.06.958.146.775.76
3313.013.013.08.07.588.7412.747.71
449.09.09.08.08.818.777.118.84
5511.011.011.08.08.339.267.818.47
6614.014.014.08.09.968.18.847.04
776.06.06.08.07.246.136.085.25
884.04.04.019.04.263.15.3912.5
9912.012.012.08.010.849.138.155.56
10107.07.07.08.04.827.266.427.91
11115.05.05.08.05.684.745.736.89
" ], "text/latex": [ "\\begin{tabular}{r|ccccccccc}\n", "\t& id & x1 & x2 & x3 & x4 & y1 & y2 & y3 & y4\\\\\n", "\t\\hline\n", "\t& Int64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64\\\\\n", "\t\\hline\n", "\t1 & 1 & 10.0 & 10.0 & 10.0 & 8.0 & 8.04 & 9.14 & 7.46 & 6.58 \\\\\n", "\t2 & 2 & 8.0 & 8.0 & 8.0 & 8.0 & 6.95 & 8.14 & 6.77 & 5.76 \\\\\n", "\t3 & 3 & 13.0 & 13.0 & 13.0 & 8.0 & 7.58 & 8.74 & 12.74 & 7.71 \\\\\n", "\t4 & 4 & 9.0 & 9.0 & 9.0 & 8.0 & 8.81 & 8.77 & 7.11 & 8.84 \\\\\n", "\t5 & 5 & 11.0 & 11.0 & 11.0 & 8.0 & 8.33 & 9.26 & 7.81 & 8.47 \\\\\n", "\t6 & 6 & 14.0 & 14.0 & 14.0 & 8.0 & 9.96 & 8.1 & 8.84 & 7.04 \\\\\n", "\t7 & 7 & 6.0 & 6.0 & 6.0 & 8.0 & 7.24 & 6.13 & 6.08 & 5.25 \\\\\n", "\t8 & 8 & 4.0 & 4.0 & 4.0 & 19.0 & 4.26 & 3.1 & 5.39 & 12.5 \\\\\n", "\t9 & 9 & 12.0 & 12.0 & 12.0 & 8.0 & 10.84 & 9.13 & 8.15 & 5.56 \\\\\n", "\t10 & 10 & 7.0 & 7.0 & 7.0 & 8.0 & 4.82 & 7.26 & 6.42 & 7.91 \\\\\n", "\t11 & 11 & 5.0 & 5.0 & 5.0 & 8.0 & 5.68 & 4.74 & 5.73 & 6.89 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "11×9 DataFrame. 
Omitted printing of 2 columns\n", "│ Row │ id │ x1 │ x2 │ x3 │ x4 │ y1 │ y2 │\n", "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼───────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n", "│ 1 │ 1 │ 10.0 │ 10.0 │ 10.0 │ 8.0 │ 8.04 │ 9.14 │\n", "│ 2 │ 2 │ 8.0 │ 8.0 │ 8.0 │ 8.0 │ 6.95 │ 8.14 │\n", "│ 3 │ 3 │ 13.0 │ 13.0 │ 13.0 │ 8.0 │ 7.58 │ 8.74 │\n", "│ 4 │ 4 │ 9.0 │ 9.0 │ 9.0 │ 8.0 │ 8.81 │ 8.77 │\n", "│ 5 │ 5 │ 11.0 │ 11.0 │ 11.0 │ 8.0 │ 8.33 │ 9.26 │\n", "│ 6 │ 6 │ 14.0 │ 14.0 │ 14.0 │ 8.0 │ 9.96 │ 8.1 │\n", "│ 7 │ 7 │ 6.0 │ 6.0 │ 6.0 │ 8.0 │ 7.24 │ 6.13 │\n", "│ 8 │ 8 │ 4.0 │ 4.0 │ 4.0 │ 19.0 │ 4.26 │ 3.1 │\n", "│ 9 │ 9 │ 12.0 │ 12.0 │ 12.0 │ 8.0 │ 10.84 │ 9.13 │\n", "│ 10 │ 10 │ 7.0 │ 7.0 │ 7.0 │ 8.0 │ 4.82 │ 7.26 │\n", "│ 11 │ 11 │ 5.0 │ 5.0 │ 5.0 │ 8.0 │ 5.68 │ 4.74 │" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select(df, \"id\", :)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that this time `df` data frame was not changed, as `select` makes a copy." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

11 rows × 9 columns

x1x2x3x4y1y2y3y4id
Float64Float64Float64Float64Float64Float64Float64Float64Int64
110.010.010.08.08.049.147.466.581
28.08.08.08.06.958.146.775.762
313.013.013.08.07.588.7412.747.713
49.09.09.08.08.818.777.118.844
511.011.011.08.08.339.267.818.475
614.014.014.08.09.968.18.847.046
76.06.06.08.07.246.136.085.257
84.04.04.019.04.263.15.3912.58
912.012.012.08.010.849.138.155.569
107.07.07.08.04.827.266.427.9110
115.05.05.08.05.684.745.736.8911
" ], "text/latex": [ "\\begin{tabular}{r|ccccccccc}\n", "\t& x1 & x2 & x3 & x4 & y1 & y2 & y3 & y4 & id\\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 10.0 & 10.0 & 10.0 & 8.0 & 8.04 & 9.14 & 7.46 & 6.58 & 1 \\\\\n", "\t2 & 8.0 & 8.0 & 8.0 & 8.0 & 6.95 & 8.14 & 6.77 & 5.76 & 2 \\\\\n", "\t3 & 13.0 & 13.0 & 13.0 & 8.0 & 7.58 & 8.74 & 12.74 & 7.71 & 3 \\\\\n", "\t4 & 9.0 & 9.0 & 9.0 & 8.0 & 8.81 & 8.77 & 7.11 & 8.84 & 4 \\\\\n", "\t5 & 11.0 & 11.0 & 11.0 & 8.0 & 8.33 & 9.26 & 7.81 & 8.47 & 5 \\\\\n", "\t6 & 14.0 & 14.0 & 14.0 & 8.0 & 9.96 & 8.1 & 8.84 & 7.04 & 6 \\\\\n", "\t7 & 6.0 & 6.0 & 6.0 & 8.0 & 7.24 & 6.13 & 6.08 & 5.25 & 7 \\\\\n", "\t8 & 4.0 & 4.0 & 4.0 & 19.0 & 4.26 & 3.1 & 5.39 & 12.5 & 8 \\\\\n", "\t9 & 12.0 & 12.0 & 12.0 & 8.0 & 10.84 & 9.13 & 8.15 & 5.56 & 9 \\\\\n", "\t10 & 7.0 & 7.0 & 7.0 & 8.0 & 4.82 & 7.26 & 6.42 & 7.91 & 10 \\\\\n", "\t11 & 5.0 & 5.0 & 5.0 & 8.0 & 5.68 & 4.74 & 5.73 & 6.89 & 11 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "11×9 DataFrame. 
Omitted printing of 2 columns\n", "│ Row │ x1 │ x2 │ x3 │ x4 │ y1 │ y2 │ y3 │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n", "│ 1 │ 10.0 │ 10.0 │ 10.0 │ 8.0 │ 8.04 │ 9.14 │ 7.46 │\n", "│ 2 │ 8.0 │ 8.0 │ 8.0 │ 8.0 │ 6.95 │ 8.14 │ 6.77 │\n", "│ 3 │ 13.0 │ 13.0 │ 13.0 │ 8.0 │ 7.58 │ 8.74 │ 12.74 │\n", "│ 4 │ 9.0 │ 9.0 │ 9.0 │ 8.0 │ 8.81 │ 8.77 │ 7.11 │\n", "│ 5 │ 11.0 │ 11.0 │ 11.0 │ 8.0 │ 8.33 │ 9.26 │ 7.81 │\n", "│ 6 │ 14.0 │ 14.0 │ 14.0 │ 8.0 │ 9.96 │ 8.1 │ 8.84 │\n", "│ 7 │ 6.0 │ 6.0 │ 6.0 │ 8.0 │ 7.24 │ 6.13 │ 6.08 │\n", "│ 8 │ 4.0 │ 4.0 │ 4.0 │ 19.0 │ 4.26 │ 3.1 │ 5.39 │\n", "│ 9 │ 12.0 │ 12.0 │ 12.0 │ 8.0 │ 10.84 │ 9.13 │ 8.15 │\n", "│ 10 │ 7.0 │ 7.0 │ 7.0 │ 8.0 │ 4.82 │ 7.26 │ 6.42 │\n", "│ 11 │ 5.0 │ 5.0 │ 5.0 │ 8.0 │ 5.68 │ 4.74 │ 5.73 │" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It is also easy to transform a data frame back to a matrix using the `Matrix` function." 
] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11×9 Array{Float64,2}:\n", " 10.0 10.0 10.0 8.0 8.04 9.14 7.46 6.58 1.0\n", " 8.0 8.0 8.0 8.0 6.95 8.14 6.77 5.76 2.0\n", " 13.0 13.0 13.0 8.0 7.58 8.74 12.74 7.71 3.0\n", " 9.0 9.0 9.0 8.0 8.81 8.77 7.11 8.84 4.0\n", " 11.0 11.0 11.0 8.0 8.33 9.26 7.81 8.47 5.0\n", " 14.0 14.0 14.0 8.0 9.96 8.1 8.84 7.04 6.0\n", " 6.0 6.0 6.0 8.0 7.24 6.13 6.08 5.25 7.0\n", " 4.0 4.0 4.0 19.0 4.26 3.1 5.39 12.5 8.0\n", " 12.0 12.0 12.0 8.0 10.84 9.13 8.15 5.56 9.0\n", " 7.0 7.0 7.0 8.0 4.82 7.26 6.42 7.91 10.0\n", " 5.0 5.0 5.0 8.0 5.68 4.74 5.73 6.89 11.0" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Matrix(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will use this feature to determine a plotting range for our data." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " We take the extrema respectively of the \"x\" and \"y\" variables, add padding equal to 1, and finally collect them to vectors (without calling the `collect` our result would be a `Tuple`)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2-element Array{Float64,1}:\n", " 3.0\n", " 20.0" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xlim = collect(extrema(Matrix(select(df, r\"x\"))) .+ (-1, 1))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2-element Array{Float64,1}:\n", " 2.1\n", " 13.74" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ylim = collect(extrema(Matrix(select(df, r\"y\"))) .+ (-1, 1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we are ready to produce the final plots.\n", "\n", "Note that the GLM.jl package is integrated with DataFrames.jl and in `lm` and `predict` functions we 
can use data frame objects to pass data.\n", "\n", "Note though that as GLM.jl in general supports other tabular types than just DataFrames.jl it is more strict and accepts only `Symbol`s as column names (this is required when we use the `term` function)." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "Figure(PyObject
)" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig, axs = plt.subplots(2, 2)\n", "fig.tight_layout(pad=4.0)\n", "for i in 1:4\n", " x = Symbol(\"x\", i)\n", " y = Symbol(\"y\", i)\n", " model = lm(term(y)~term(x), df)\n", " axs[i].plot(xlim, predict(model, DataFrame(x => xlim)), color=\"orange\")\n", " axs[i].scatter(df[:, x], df[:, y])\n", " axs[i].set_xlim(xlim)\n", " axs[i].set_ylim(ylim)\n", " axs[i].set_xlabel(\"x$i\")\n", " axs[i].set_ylabel(\"y$i\")\n", " a, b = round.(coef(model), digits=2)\n", " c = round(100 * r2(model), digits=2)\n", " axs[i].set_title(string(\"R²=$c%, $y=$a+$b$x\"))\n", "end" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We note that in all cases the estimated models have almost exactly the same R² and estimated coefficients." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It is worth highlighting several important features of DataFrames.jl package functionality that we used in the above example." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, it is easy to create a data frame from variables holding column names and values using `=>`. Here is one more example of how this functionality can be used:" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

3 rows × 2 columns

var1var2
Int64Int64
114
225
336
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& var1 & var2\\\\\n", "\t\\hline\n", "\t& Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 4 \\\\\n", "\t2 & 2 & 5 \\\\\n", "\t3 & 3 & 6 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "3×2 DataFrame\n", "│ Row │ var1 │ var2 │\n", "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │\n", "├─────┼───────┼───────┤\n", "│ 1 │ 1 │ 4 │\n", "│ 2 │ 2 │ 5 │\n", "│ 3 │ 3 │ 6 │" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x = :var1\n", "y = :var2\n", "xc = 1:3\n", "yc = 4:6\n", "DataFrame(x => xc, y => yc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that the same effect can be achieved by passing keyword arguments directly to the `DataFrame` constructor:" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

3 rows × 2 columns

var1var2
Int64Int64
114
225
336
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& var1 & var2\\\\\n", "\t\\hline\n", "\t& Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 4 \\\\\n", "\t2 & 2 & 5 \\\\\n", "\t3 & 3 & 6 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "3×2 DataFrame\n", "│ Row │ var1 │ var2 │\n", "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │\n", "├─────┼───────┼───────┤\n", "│ 1 │ 1 │ 4 │\n", "│ 2 │ 2 │ 5 │\n", "│ 3 │ 3 │ 6 │" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DataFrame(var1=xc, var2=yc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another thing you might have noticed is the use of `df[:, x]` and `df[:, y]` indexing expressions to get columns from a data frame. Let us comment on the differences between the `df.col` and `df[:, col]` syntaxes that we discussed above:" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11-element Array{Float64,1}:\n", " 10.0\n", " 8.0\n", " 13.0\n", " 9.0\n", " 11.0\n", " 14.0\n", " 6.0\n", " 4.0\n", " 12.0\n", " 7.0\n", " 5.0" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# gives you direct access to the column stored in `df`, but `x1` has to be a literal\n", "df.x1" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11-element Array{Float64,1}:\n", " 10.0\n", " 8.0\n", " 13.0\n", " 9.0\n", " 11.0\n", " 14.0\n", " 6.0\n", " 4.0\n", " 12.0\n", " 7.0\n", " 5.0" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# now the column is copied, so this is the same as copy(df.x1)\n", "# in this case we could have used a variable instead of a literal for indexing\n", "df[:, :x1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So what if you want to select a column without copying it, but using a variable that holds its name?
There is a special `!` row selector for this:" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ ":x1" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "n = :x1" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11-element Array{Float64,1}:\n", " 10.0\n", " 8.0\n", " 13.0\n", " 9.0\n", " 11.0\n", " 14.0\n", " 6.0\n", " 4.0\n", " 12.0\n", " 7.0\n", " 5.0" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "v = df[!, n]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let us check that `!` does not copy:" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "true" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "v === df.x1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Indexing of a data frame is very flexible. All rules governing this functionality can be found at https://juliadata.github.io/DataFrames.jl/stable/lib/indexing/." ] } ], "metadata": { "@webio": { "lastCommId": null, "lastKernelId": null }, "kernelspec": { "display_name": "Julia 1.4.1", "language": "julia", "name": "julia-1.4" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.4.1" } }, "nbformat": 4, "nbformat_minor": 4 }