{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Working with groups of rows of a data frame\n", "\n", "### Bogumił Kamiński" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "using DataFrames" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "using CSV" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "using Statistics" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "using FreqTables" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "using Pipe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This part of the tutorial assumes that you have run *3. Working with text files* part so that you have auto2.csv file in your working directory." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(15, 200)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ENV[\"LINES\"], ENV[\"COLUMNS\"] = 15, 200" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

392 rows × 10 columns

mpgcylindersdisplacementhorsepowerweightaccelerationyearoriginnamebrand
Float64Float64Float64Float64Float64Float64Float64Float64StringString
118.08.0307.0130.03504.012.070.01.0chevrolet chevelle malibuchevrolet
215.08.0350.0165.03693.011.570.01.0buick skylark 320buick
318.08.0318.0150.03436.011.070.01.0plymouth satelliteplymouth
416.08.0304.0150.03433.012.070.01.0amc rebel sstamc
517.08.0302.0140.03449.010.570.01.0ford torinoford
615.08.0429.0198.04341.010.070.01.0ford galaxie 500ford
714.08.0454.0220.04354.09.070.01.0chevrolet impalachevrolet
814.08.0440.0215.04312.08.570.01.0plymouth fury iiiplymouth
914.08.0455.0225.04425.010.070.01.0pontiac catalinapontiac
1015.08.0390.0190.03850.08.570.01.0amc ambassador dplamc
1115.08.0383.0170.03563.010.070.01.0dodge challenger sedodge
1214.08.0340.0160.03609.08.070.01.0plymouth 'cuda 340plymouth
1315.08.0400.0150.03761.09.570.01.0chevrolet monte carlochevrolet
1414.08.0455.0225.03086.010.070.01.0buick estate wagon (sw)buick
1524.04.0113.095.02372.015.070.03.0toyota corona mark iitoyota
" ], "text/latex": [ "\\begin{tabular}{r|cccccccccc}\n", "\t& mpg & cylinders & displacement & horsepower & weight & acceleration & year & origin & name & brand\\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & String & String\\\\\n", "\t\\hline\n", "\t1 & 18.0 & 8.0 & 307.0 & 130.0 & 3504.0 & 12.0 & 70.0 & 1.0 & chevrolet chevelle malibu & chevrolet \\\\\n", "\t2 & 15.0 & 8.0 & 350.0 & 165.0 & 3693.0 & 11.5 & 70.0 & 1.0 & buick skylark 320 & buick \\\\\n", "\t3 & 18.0 & 8.0 & 318.0 & 150.0 & 3436.0 & 11.0 & 70.0 & 1.0 & plymouth satellite & plymouth \\\\\n", "\t4 & 16.0 & 8.0 & 304.0 & 150.0 & 3433.0 & 12.0 & 70.0 & 1.0 & amc rebel sst & amc \\\\\n", "\t5 & 17.0 & 8.0 & 302.0 & 140.0 & 3449.0 & 10.5 & 70.0 & 1.0 & ford torino & ford \\\\\n", "\t6 & 15.0 & 8.0 & 429.0 & 198.0 & 4341.0 & 10.0 & 70.0 & 1.0 & ford galaxie 500 & ford \\\\\n", "\t7 & 14.0 & 8.0 & 454.0 & 220.0 & 4354.0 & 9.0 & 70.0 & 1.0 & chevrolet impala & chevrolet \\\\\n", "\t8 & 14.0 & 8.0 & 440.0 & 215.0 & 4312.0 & 8.5 & 70.0 & 1.0 & plymouth fury iii & plymouth \\\\\n", "\t9 & 14.0 & 8.0 & 455.0 & 225.0 & 4425.0 & 10.0 & 70.0 & 1.0 & pontiac catalina & pontiac \\\\\n", "\t10 & 15.0 & 8.0 & 390.0 & 190.0 & 3850.0 & 8.5 & 70.0 & 1.0 & amc ambassador dpl & amc \\\\\n", "\t11 & 15.0 & 8.0 & 383.0 & 170.0 & 3563.0 & 10.0 & 70.0 & 1.0 & dodge challenger se & dodge \\\\\n", "\t12 & 14.0 & 8.0 & 340.0 & 160.0 & 3609.0 & 8.0 & 70.0 & 1.0 & plymouth 'cuda 340 & plymouth \\\\\n", "\t13 & 15.0 & 8.0 & 400.0 & 150.0 & 3761.0 & 9.5 & 70.0 & 1.0 & chevrolet monte carlo & chevrolet \\\\\n", "\t14 & 14.0 & 8.0 & 455.0 & 225.0 & 3086.0 & 10.0 & 70.0 & 1.0 & buick estate wagon (sw) & buick \\\\\n", "\t15 & 24.0 & 4.0 & 113.0 & 95.0 & 2372.0 & 15.0 & 70.0 & 3.0 & toyota corona mark ii & toyota \\\\\n", "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "392×10 DataFrame\n", "│ Row │ mpg │ cylinders │ displacement │ horsepower │ weight │ acceleration │ year │ origin │ name │ brand │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │\n", "├─────┼─────────┼───────────┼──────────────┼────────────┼─────────┼──────────────┼─────────┼─────────┼───────────────────────────┼───────────┤\n", "│ 1 │ 18.0 │ 8.0 │ 307.0 │ 130.0 │ 3504.0 │ 12.0 │ 70.0 │ 1.0 │ chevrolet chevelle malibu │ chevrolet │\n", "│ 2 │ 15.0 │ 8.0 │ 350.0 │ 165.0 │ 3693.0 │ 11.5 │ 70.0 │ 1.0 │ buick skylark 320 │ buick │\n", "│ 3 │ 18.0 │ 8.0 │ 318.0 │ 150.0 │ 3436.0 │ 11.0 │ 70.0 │ 1.0 │ plymouth satellite │ plymouth │\n", "⋮\n", "│ 389 │ 44.0 │ 4.0 │ 97.0 │ 52.0 │ 2130.0 │ 24.6 │ 82.0 │ 2.0 │ vw pickup │ vw │\n", "│ 390 │ 32.0 │ 4.0 │ 135.0 │ 84.0 │ 2295.0 │ 11.6 │ 82.0 │ 1.0 │ dodge rampage │ dodge │\n", "│ 391 │ 28.0 │ 4.0 │ 120.0 │ 79.0 │ 2625.0 │ 18.6 │ 82.0 │ 1.0 │ ford ranger │ ford │\n", "│ 392 │ 31.0 │ 4.0 │ 119.0 │ 82.0 │ 2720.0 │ 19.4 │ 82.0 │ 1.0 │ chevy s-10 │ chevy │" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = CSV.File(\"auto2.csv\") |> DataFrame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We want to group our data frame by `:brand` column:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

GroupedDataFrame with 37 groups based on key: brand

First Group (43 rows): brand = \"chevrolet\"

mpgcylindersdisplacementhorsepowerweightaccelerationyearoriginnamebrand
Float64Float64Float64Float64Float64Float64Float64Float64StringString
118.08.0307.0130.03504.012.070.01.0chevrolet chevelle malibuchevrolet
214.08.0454.0220.04354.09.070.01.0chevrolet impalachevrolet
315.08.0400.0150.03761.09.570.01.0chevrolet monte carlochevrolet
428.04.0140.090.02264.015.571.01.0chevrolet vega 2300chevrolet
517.06.0250.0100.03329.015.571.01.0chevrolet chevelle malibuchevrolet
614.08.0350.0165.04209.012.071.01.0chevrolet impalachevrolet
722.04.0140.072.02408.019.071.01.0chevrolet vega (sw)chevrolet
820.04.0140.090.02408.019.572.01.0chevrolet vegachevrolet
913.08.0350.0165.04274.012.072.01.0chevrolet impalachevrolet
1013.08.0307.0130.04098.014.072.01.0chevrolet chevelle concours (sw)chevrolet
1113.08.0350.0145.03988.013.073.01.0chevrolet malibuchevrolet
1213.08.0400.0150.04464.012.073.01.0chevrolet caprice classicchevrolet
1316.06.0250.0100.03278.018.073.01.0chevrolet nova customchevrolet
1411.08.0400.0150.04997.014.073.01.0chevrolet impalachevrolet
1521.04.0140.072.02401.019.573.01.0chevrolet vegachevrolet

Last Group (1 row): brand = \"nissan\"

mpgcylindersdisplacementhorsepowerweightaccelerationyearoriginnamebrand
Float64Float64Float64Float64Float64Float64Float64Float64StringString
136.04.0120.088.02160.014.582.03.0nissan stanza xenissan
" ], "text/latex": [ "GroupedDataFrame with 37 groups based on key: brand\n", "\n", "First Group (43 rows): brand = \"chevrolet\"\n", "\n", "\\begin{tabular}{r|cccccccccc}\n", "\t& mpg & cylinders & displacement & horsepower & weight & acceleration & year & origin & name & brand\\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & String & String\\\\\n", "\t\\hline\n", "\t1 & 18.0 & 8.0 & 307.0 & 130.0 & 3504.0 & 12.0 & 70.0 & 1.0 & chevrolet chevelle malibu & chevrolet \\\\\n", "\t2 & 14.0 & 8.0 & 454.0 & 220.0 & 4354.0 & 9.0 & 70.0 & 1.0 & chevrolet impala & chevrolet \\\\\n", "\t3 & 15.0 & 8.0 & 400.0 & 150.0 & 3761.0 & 9.5 & 70.0 & 1.0 & chevrolet monte carlo & chevrolet \\\\\n", "\t4 & 28.0 & 4.0 & 140.0 & 90.0 & 2264.0 & 15.5 & 71.0 & 1.0 & chevrolet vega 2300 & chevrolet \\\\\n", "\t5 & 17.0 & 6.0 & 250.0 & 100.0 & 3329.0 & 15.5 & 71.0 & 1.0 & chevrolet chevelle malibu & chevrolet \\\\\n", "\t6 & 14.0 & 8.0 & 350.0 & 165.0 & 4209.0 & 12.0 & 71.0 & 1.0 & chevrolet impala & chevrolet \\\\\n", "\t7 & 22.0 & 4.0 & 140.0 & 72.0 & 2408.0 & 19.0 & 71.0 & 1.0 & chevrolet vega (sw) & chevrolet \\\\\n", "\t8 & 20.0 & 4.0 & 140.0 & 90.0 & 2408.0 & 19.5 & 72.0 & 1.0 & chevrolet vega & chevrolet \\\\\n", "\t9 & 13.0 & 8.0 & 350.0 & 165.0 & 4274.0 & 12.0 & 72.0 & 1.0 & chevrolet impala & chevrolet \\\\\n", "\t10 & 13.0 & 8.0 & 307.0 & 130.0 & 4098.0 & 14.0 & 72.0 & 1.0 & chevrolet chevelle concours (sw) & chevrolet \\\\\n", "\t11 & 13.0 & 8.0 & 350.0 & 145.0 & 3988.0 & 13.0 & 73.0 & 1.0 & chevrolet malibu & chevrolet \\\\\n", "\t12 & 13.0 & 8.0 & 400.0 & 150.0 & 4464.0 & 12.0 & 73.0 & 1.0 & chevrolet caprice classic & chevrolet \\\\\n", "\t13 & 16.0 & 6.0 & 250.0 & 100.0 & 3278.0 & 18.0 & 73.0 & 1.0 & chevrolet nova custom & chevrolet \\\\\n", "\t14 & 11.0 & 8.0 & 400.0 & 150.0 & 4997.0 & 14.0 & 73.0 & 1.0 & chevrolet impala & chevrolet \\\\\n", "\t15 & 21.0 & 4.0 & 140.0 & 72.0 & 2401.0 & 19.5 & 73.0 & 1.0 & chevrolet vega & chevrolet \\\\\n", "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ \\\\\n", "\\end{tabular}\n", "\n", "$\\dots$\n", "\n", "Last Group (1 row): brand = \"nissan\"\n", "\n", "\\begin{tabular}{r|cccccccccc}\n", "\t& mpg & cylinders & displacement & horsepower & weight & acceleration & year & origin & name & brand\\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & String & String\\\\\n", "\t\\hline\n", "\t1 & 36.0 & 4.0 & 120.0 & 88.0 & 2160.0 & 14.5 & 82.0 & 3.0 & nissan stanza xe & nissan \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "GroupedDataFrame with 37 groups based on key: brand\n", "First Group (43 rows): brand = \"chevrolet\"\n", "│ Row │ mpg │ cylinders │ displacement │ horsepower │ weight │ acceleration │ year │ origin │ name │ brand │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │\n", "├─────┼─────────┼───────────┼──────────────┼────────────┼─────────┼──────────────┼─────────┼─────────┼───────────────────────────┼───────────┤\n", "│ 1 │ 18.0 │ 8.0 │ 307.0 │ 130.0 │ 3504.0 │ 12.0 │ 70.0 │ 1.0 │ chevrolet chevelle malibu │ chevrolet │\n", "│ 2 │ 14.0 │ 8.0 │ 454.0 │ 220.0 │ 4354.0 │ 9.0 │ 70.0 │ 1.0 │ chevrolet impala │ chevrolet │\n", "│ 3 │ 15.0 │ 8.0 │ 400.0 │ 150.0 │ 3761.0 │ 9.5 │ 70.0 │ 1.0 │ chevrolet monte carlo │ chevrolet │\n", "⋮\n", "│ 40 │ 28.0 │ 4.0 │ 112.0 │ 88.0 │ 2605.0 │ 19.6 │ 82.0 │ 1.0 │ chevrolet cavalier │ chevrolet │\n", "│ 41 │ 27.0 │ 4.0 │ 112.0 │ 88.0 │ 2640.0 │ 18.6 │ 82.0 │ 1.0 │ chevrolet cavalier wagon │ chevrolet │\n", "│ 42 │ 34.0 │ 4.0 │ 112.0 │ 88.0 │ 2395.0 │ 18.0 │ 82.0 │ 1.0 │ chevrolet cavalier 2-door │ chevrolet │\n", "│ 43 │ 27.0 │ 4.0 │ 151.0 │ 90.0 │ 2950.0 │ 17.3 │ 82.0 │ 1.0 │ chevrolet camaro │ chevrolet │\n", "⋮\n", "Last Group (1 row): brand = \"nissan\"\n", "│ Row │ mpg │ cylinders │ displacement │ horsepower │ weight │ acceleration │ year │ origin │ name │ brand │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │\n", "├─────┼─────────┼───────────┼──────────────┼────────────┼─────────┼──────────────┼─────────┼─────────┼──────────────────┼────────┤\n", "│ 1 │ 36.0 │ 4.0 │ 120.0 │ 88.0 │ 2160.0 │ 14.5 │ 82.0 │ 3.0 │ nissan stanza xe │ nissan │" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gdf = groupby(df, :brand)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that it is easy (and fast) to find an appropriate group by passing a `Tuple` of values of grouping columns (in our case it is just one column)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

48 rows × 10 columns

mpgcylindersdisplacementhorsepowerweightaccelerationyearoriginnamebrand
Float64Float64Float64Float64Float64Float64Float64Float64StringString
117.08.0302.0140.03449.010.570.01.0ford torinoford
215.08.0429.0198.04341.010.070.01.0ford galaxie 500ford
321.06.0200.085.02587.016.070.01.0ford maverickford
410.08.0360.0215.04615.014.070.01.0ford f250ford
519.06.0250.088.03302.015.571.01.0ford torino 500ford
614.08.0351.0153.04154.013.571.01.0ford galaxie 500ford
713.08.0400.0170.04746.012.071.01.0ford country squire (sw)ford
818.06.0250.088.03139.014.571.01.0ford mustangford
921.04.0122.086.02226.016.572.01.0ford pinto runaboutford
1014.08.0351.0153.04129.013.072.01.0ford galaxie 500ford
1113.08.0302.0140.04294.016.072.01.0ford gran torino (sw)ford
1222.04.0122.086.02395.016.072.01.0ford pinto (sw)ford
1314.08.0302.0137.04042.014.573.01.0ford gran torinoford
1413.08.0351.0158.04363.013.073.01.0ford ltdford
1518.06.0250.088.03021.016.573.01.0ford maverickford
" ], "text/latex": [ "\\begin{tabular}{r|cccccccccc}\n", "\t& mpg & cylinders & displacement & horsepower & weight & acceleration & year & origin & name & brand\\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & String & String\\\\\n", "\t\\hline\n", "\t1 & 17.0 & 8.0 & 302.0 & 140.0 & 3449.0 & 10.5 & 70.0 & 1.0 & ford torino & ford \\\\\n", "\t2 & 15.0 & 8.0 & 429.0 & 198.0 & 4341.0 & 10.0 & 70.0 & 1.0 & ford galaxie 500 & ford \\\\\n", "\t3 & 21.0 & 6.0 & 200.0 & 85.0 & 2587.0 & 16.0 & 70.0 & 1.0 & ford maverick & ford \\\\\n", "\t4 & 10.0 & 8.0 & 360.0 & 215.0 & 4615.0 & 14.0 & 70.0 & 1.0 & ford f250 & ford \\\\\n", "\t5 & 19.0 & 6.0 & 250.0 & 88.0 & 3302.0 & 15.5 & 71.0 & 1.0 & ford torino 500 & ford \\\\\n", "\t6 & 14.0 & 8.0 & 351.0 & 153.0 & 4154.0 & 13.5 & 71.0 & 1.0 & ford galaxie 500 & ford \\\\\n", "\t7 & 13.0 & 8.0 & 400.0 & 170.0 & 4746.0 & 12.0 & 71.0 & 1.0 & ford country squire (sw) & ford \\\\\n", "\t8 & 18.0 & 6.0 & 250.0 & 88.0 & 3139.0 & 14.5 & 71.0 & 1.0 & ford mustang & ford \\\\\n", "\t9 & 21.0 & 4.0 & 122.0 & 86.0 & 2226.0 & 16.5 & 72.0 & 1.0 & ford pinto runabout & ford \\\\\n", "\t10 & 14.0 & 8.0 & 351.0 & 153.0 & 4129.0 & 13.0 & 72.0 & 1.0 & ford galaxie 500 & ford \\\\\n", "\t11 & 13.0 & 8.0 & 302.0 & 140.0 & 4294.0 & 16.0 & 72.0 & 1.0 & ford gran torino (sw) & ford \\\\\n", "\t12 & 22.0 & 4.0 & 122.0 & 86.0 & 2395.0 & 16.0 & 72.0 & 1.0 & ford pinto (sw) & ford \\\\\n", "\t13 & 14.0 & 8.0 & 302.0 & 137.0 & 4042.0 & 14.5 & 73.0 & 1.0 & ford gran torino & ford \\\\\n", "\t14 & 13.0 & 8.0 & 351.0 & 158.0 & 4363.0 & 13.0 & 73.0 & 1.0 & ford ltd & ford \\\\\n", "\t15 & 18.0 & 6.0 & 250.0 & 88.0 & 3021.0 & 16.5 & 73.0 & 1.0 & ford maverick & ford \\\\\n", "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "48×10 SubDataFrame\n", "│ Row │ mpg │ cylinders │ displacement │ horsepower │ weight │ acceleration │ year │ origin │ name │ brand │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │\n", "├─────┼─────────┼───────────┼──────────────┼────────────┼─────────┼──────────────┼─────────┼─────────┼──────────────────────┼────────┤\n", "│ 1 │ 17.0 │ 8.0 │ 302.0 │ 140.0 │ 3449.0 │ 10.5 │ 70.0 │ 1.0 │ ford torino │ ford │\n", "│ 2 │ 15.0 │ 8.0 │ 429.0 │ 198.0 │ 4341.0 │ 10.0 │ 70.0 │ 1.0 │ ford galaxie 500 │ ford │\n", "│ 3 │ 21.0 │ 6.0 │ 200.0 │ 85.0 │ 2587.0 │ 16.0 │ 70.0 │ 1.0 │ ford maverick │ ford │\n", "⋮\n", "│ 45 │ 24.0 │ 4.0 │ 140.0 │ 92.0 │ 2865.0 │ 16.4 │ 82.0 │ 1.0 │ ford fairmont futura │ ford │\n", "│ 46 │ 22.0 │ 6.0 │ 232.0 │ 112.0 │ 2835.0 │ 14.7 │ 82.0 │ 1.0 │ ford granada l │ ford │\n", "│ 47 │ 27.0 │ 4.0 │ 140.0 │ 86.0 │ 2790.0 │ 15.6 │ 82.0 │ 1.0 │ ford mustang gl │ ford │\n", "│ 48 │ 28.0 │ 4.0 │ 120.0 │ 79.0 │ 2625.0 │ 18.6 │ 82.0 │ 1.0 │ ford ranger │ ford │" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gdf[(\"ford\",)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using the `combine` function we can easily calculate some aggregates by group:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

37 rows × 2 columns

brandmpg_mean
StringFloat64
1chevrolet20.4721
2buick19.1824
3plymouth21.7032
4amc18.0704
5ford19.475
6pontiac20.0125
7dodge22.0607
8toyota28.372
9datsun31.113
10volkswagen29.1067
11peugeot23.6875
12audi26.7143
13saab23.9
14bmw23.75
15chevy18.0
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& brand & mpg\\_mean\\\\\n", "\t\\hline\n", "\t& String & Float64\\\\\n", "\t\\hline\n", "\t1 & chevrolet & 20.4721 \\\\\n", "\t2 & buick & 19.1824 \\\\\n", "\t3 & plymouth & 21.7032 \\\\\n", "\t4 & amc & 18.0704 \\\\\n", "\t5 & ford & 19.475 \\\\\n", "\t6 & pontiac & 20.0125 \\\\\n", "\t7 & dodge & 22.0607 \\\\\n", "\t8 & toyota & 28.372 \\\\\n", "\t9 & datsun & 31.113 \\\\\n", "\t10 & volkswagen & 29.1067 \\\\\n", "\t11 & peugeot & 23.6875 \\\\\n", "\t12 & audi & 26.7143 \\\\\n", "\t13 & saab & 23.9 \\\\\n", "\t14 & bmw & 23.75 \\\\\n", "\t15 & chevy & 18.0 \\\\\n", "\t$\\dots$ & $\\dots$ & $\\dots$ \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "37×2 DataFrame\n", "│ Row │ brand │ mpg_mean │\n", "│ │ \u001b[90mString\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼───────────┼──────────┤\n", "│ 1 │ chevrolet │ 20.4721 │\n", "│ 2 │ buick │ 19.1824 │\n", "│ 3 │ plymouth │ 21.7032 │\n", "⋮\n", "│ 34 │ mercedes │ 25.4 │\n", "│ 35 │ vokswagen │ 29.8 │\n", "│ 36 │ triumph │ 35.0 │\n", "│ 37 │ nissan │ 36.0 │" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "brand_mpg = combine(gdf, :mpg => mean)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `:mpg => mean` syntax means that we pass a column `:mpg` to be processed by the `mean` function for each group.\n", "\n", "We could have added a target column name to the generated column in the following way:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

37 rows × 2 columns

brandmean_mpg
StringFloat64
1chevrolet20.4721
2buick19.1824
3plymouth21.7032
4amc18.0704
5ford19.475
6pontiac20.0125
7dodge22.0607
8toyota28.372
9datsun31.113
10volkswagen29.1067
11peugeot23.6875
12audi26.7143
13saab23.9
14bmw23.75
15chevy18.0
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& brand & mean\\_mpg\\\\\n", "\t\\hline\n", "\t& String & Float64\\\\\n", "\t\\hline\n", "\t1 & chevrolet & 20.4721 \\\\\n", "\t2 & buick & 19.1824 \\\\\n", "\t3 & plymouth & 21.7032 \\\\\n", "\t4 & amc & 18.0704 \\\\\n", "\t5 & ford & 19.475 \\\\\n", "\t6 & pontiac & 20.0125 \\\\\n", "\t7 & dodge & 22.0607 \\\\\n", "\t8 & toyota & 28.372 \\\\\n", "\t9 & datsun & 31.113 \\\\\n", "\t10 & volkswagen & 29.1067 \\\\\n", "\t11 & peugeot & 23.6875 \\\\\n", "\t12 & audi & 26.7143 \\\\\n", "\t13 & saab & 23.9 \\\\\n", "\t14 & bmw & 23.75 \\\\\n", "\t15 & chevy & 18.0 \\\\\n", "\t$\\dots$ & $\\dots$ & $\\dots$ \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "37×2 DataFrame\n", "│ Row │ brand │ mean_mpg │\n", "│ │ \u001b[90mString\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼───────────┼──────────┤\n", "│ 1 │ chevrolet │ 20.4721 │\n", "│ 2 │ buick │ 19.1824 │\n", "│ 3 │ plymouth │ 21.7032 │\n", "⋮\n", "│ 34 │ mercedes │ 25.4 │\n", "│ 35 │ vokswagen │ 29.8 │\n", "│ 36 │ triumph │ 35.0 │\n", "│ 37 │ nissan │ 36.0 │" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combine(gdf, :mpg => mean => :mean_mpg)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let us see how one can sort a data frame. In order to see alll rows of a data frame we temporarily change the number of rows printed." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "50" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ENV[\"LINES\"] = 50" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "

37 rows × 2 columns

brandmpg_mean
StringFloat64
1vw39.0167
2nissan36.0
3triumph35.0
4honda33.7615
5datsun31.113
6mazda30.86
7subaru30.525
8vokswagen29.8
9renault29.6667
10volkswagen29.1067
11fiat28.9125
12toyota28.372
13audi26.7143
14maxda26.05
15opel25.75
16mercedes25.4
17capri25.0
18saab23.9
19bmw23.75
20peugeot23.6875
21mercedes-benz23.25
22toyouta23.0
23dodge22.0607
24plymouth21.7032
25volvo21.1167
26oldsmobile21.1
27chevrolet20.4721
28pontiac20.0125
29cadillac19.75
30ford19.475
31buick19.1824
32mercury19.1182
33amc18.0704
34chevy18.0
35chrysler17.2667
36chevroelt16.0
37hi9.0
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& brand & mpg\\_mean\\\\\n", "\t\\hline\n", "\t& String & Float64\\\\\n", "\t\\hline\n", "\t1 & vw & 39.0167 \\\\\n", "\t2 & nissan & 36.0 \\\\\n", "\t3 & triumph & 35.0 \\\\\n", "\t4 & honda & 33.7615 \\\\\n", "\t5 & datsun & 31.113 \\\\\n", "\t6 & mazda & 30.86 \\\\\n", "\t7 & subaru & 30.525 \\\\\n", "\t8 & vokswagen & 29.8 \\\\\n", "\t9 & renault & 29.6667 \\\\\n", "\t10 & volkswagen & 29.1067 \\\\\n", "\t11 & fiat & 28.9125 \\\\\n", "\t12 & toyota & 28.372 \\\\\n", "\t13 & audi & 26.7143 \\\\\n", "\t14 & maxda & 26.05 \\\\\n", "\t15 & opel & 25.75 \\\\\n", "\t16 & mercedes & 25.4 \\\\\n", "\t17 & capri & 25.0 \\\\\n", "\t18 & saab & 23.9 \\\\\n", "\t19 & bmw & 23.75 \\\\\n", "\t20 & peugeot & 23.6875 \\\\\n", "\t21 & mercedes-benz & 23.25 \\\\\n", "\t22 & toyouta & 23.0 \\\\\n", "\t23 & dodge & 22.0607 \\\\\n", "\t24 & plymouth & 21.7032 \\\\\n", "\t25 & volvo & 21.1167 \\\\\n", "\t26 & oldsmobile & 21.1 \\\\\n", "\t27 & chevrolet & 20.4721 \\\\\n", "\t28 & pontiac & 20.0125 \\\\\n", "\t29 & cadillac & 19.75 \\\\\n", "\t30 & ford & 19.475 \\\\\n", "\t31 & buick & 19.1824 \\\\\n", "\t32 & mercury & 19.1182 \\\\\n", "\t33 & amc & 18.0704 \\\\\n", "\t34 & chevy & 18.0 \\\\\n", "\t35 & chrysler & 17.2667 \\\\\n", "\t36 & chevroelt & 16.0 \\\\\n", "\t37 & hi & 9.0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "37×2 DataFrame\n", "│ Row │ brand │ mpg_mean │\n", "│ │ \u001b[90mString\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼───────────────┼──────────┤\n", "│ 1 │ vw │ 39.0167 │\n", "│ 2 │ nissan │ 36.0 │\n", "│ 3 │ triumph │ 35.0 │\n", "│ 4 │ honda │ 33.7615 │\n", "│ 5 │ datsun │ 31.113 │\n", "│ 6 │ mazda │ 30.86 │\n", "│ 7 │ subaru │ 30.525 │\n", "│ 8 │ vokswagen │ 29.8 │\n", "│ 9 │ renault │ 29.6667 │\n", "│ 10 │ volkswagen │ 29.1067 │\n", "│ 11 │ fiat │ 28.9125 │\n", "│ 12 │ toyota │ 28.372 │\n", "│ 13 │ audi │ 26.7143 │\n", "│ 14 │ maxda │ 26.05 │\n", "│ 15 │ opel │ 25.75 │\n", "│ 16 │ mercedes │ 25.4 │\n", "│ 17 │ capri │ 25.0 │\n", "│ 18 │ saab │ 23.9 │\n", "│ 19 │ bmw │ 23.75 │\n", "│ 20 │ peugeot │ 23.6875 │\n", "│ 21 │ mercedes-benz │ 23.25 │\n", "│ 22 │ toyouta │ 23.0 │\n", "│ 23 │ dodge │ 22.0607 │\n", "│ 24 │ plymouth │ 21.7032 │\n", "│ 25 │ volvo │ 21.1167 │\n", "│ 26 │ oldsmobile │ 21.1 │\n", "│ 27 │ chevrolet │ 20.4721 │\n", "│ 28 │ pontiac │ 20.0125 │\n", "│ 29 │ cadillac │ 19.75 │\n", "│ 30 │ ford │ 19.475 │\n", "│ 31 │ buick │ 19.1824 │\n", "│ 32 │ mercury │ 19.1182 │\n", "│ 33 │ amc │ 18.0704 │\n", "│ 34 │ chevy │ 18.0 │\n", "│ 35 │ chrysler │ 17.2667 │\n", "│ 36 │ chevroelt │ 16.0 │\n", "│ 37 │ hi │ 9.0 │" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sort!(brand_mpg, :mpg_mean, rev=true) # sort! performs an in-place operation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A typical data cleaning task is to check its consistency. In this case it would be making sure that each brand has a unique origin. We will do it in several ways." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First we use FreqTables.jl:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "37×3 Named Array{Int64,2}\n", "brand ╲ origin │ 1.0 2.0 3.0\n", "───────────────┼──────────────\n", "amc │ 27 0 0\n", "audi │ 0 7 0\n", "bmw │ 0 2 0\n", "buick │ 17 0 0\n", "cadillac │ 2 0 0\n", "capri │ 1 0 0\n", "chevroelt │ 1 0 0\n", "chevrolet │ 43 0 0\n", "chevy │ 3 0 0\n", "chrysler │ 6 0 0\n", "datsun │ 0 0 23\n", "dodge │ 28 0 0\n", "fiat │ 0 8 0\n", "ford │ 48 0 0\n", "hi │ 1 0 0\n", "honda │ 0 0 13\n", "maxda │ 0 0 2\n", "mazda │ 0 0 10\n", "mercedes │ 0 1 0\n", "mercedes-benz │ 0 2 0\n", "mercury │ 11 0 0\n", "nissan │ 0 0 1\n", "oldsmobile │ 10 0 0\n", "opel │ 0 4 0\n", "peugeot │ 0 8 0\n", "plymouth │ 31 0 0\n", "pontiac │ 16 0 0\n", "renault │ 0 3 0\n", "saab │ 0 4 0\n", "subaru │ 0 0 4\n", "toyota │ 0 0 25\n", "toyouta │ 0 0 1\n", "triumph │ 0 1 0\n", "vokswagen │ 0 1 0\n", "volkswagen │ 0 15 0\n", "volvo │ 0 6 0\n", "vw │ 0 6 0" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "freqtable(df, :brand, :origin)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let us create a chain of operations using Pipe.jl. Here `_` denotes the position of the argument passed from the previous step in the chain." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

37 rows × 2 columns

brandorigin_function
StringInt64
1chevrolet1
2buick1
3plymouth1
4amc1
5ford1
6pontiac1
7dodge1
8toyota1
9datsun1
10volkswagen1
11peugeot1
12audi1
13saab1
14bmw1
15chevy1
16hi1
17mercury1
18opel1
19fiat1
20oldsmobile1
21chrysler1
22mazda1
23volvo1
24renault1
25toyouta1
26maxda1
27honda1
28subaru1
29chevroelt1
30capri1
31vw1
32mercedes-benz1
33cadillac1
34mercedes1
35vokswagen1
36triumph1
37nissan1
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& brand & origin\\_function\\\\\n", "\t\\hline\n", "\t& String & Int64\\\\\n", "\t\\hline\n", "\t1 & chevrolet & 1 \\\\\n", "\t2 & buick & 1 \\\\\n", "\t3 & plymouth & 1 \\\\\n", "\t4 & amc & 1 \\\\\n", "\t5 & ford & 1 \\\\\n", "\t6 & pontiac & 1 \\\\\n", "\t7 & dodge & 1 \\\\\n", "\t8 & toyota & 1 \\\\\n", "\t9 & datsun & 1 \\\\\n", "\t10 & volkswagen & 1 \\\\\n", "\t11 & peugeot & 1 \\\\\n", "\t12 & audi & 1 \\\\\n", "\t13 & saab & 1 \\\\\n", "\t14 & bmw & 1 \\\\\n", "\t15 & chevy & 1 \\\\\n", "\t16 & hi & 1 \\\\\n", "\t17 & mercury & 1 \\\\\n", "\t18 & opel & 1 \\\\\n", "\t19 & fiat & 1 \\\\\n", "\t20 & oldsmobile & 1 \\\\\n", "\t21 & chrysler & 1 \\\\\n", "\t22 & mazda & 1 \\\\\n", "\t23 & volvo & 1 \\\\\n", "\t24 & renault & 1 \\\\\n", "\t25 & toyouta & 1 \\\\\n", "\t26 & maxda & 1 \\\\\n", "\t27 & honda & 1 \\\\\n", "\t28 & subaru & 1 \\\\\n", "\t29 & chevroelt & 1 \\\\\n", "\t30 & capri & 1 \\\\\n", "\t31 & vw & 1 \\\\\n", "\t32 & mercedes-benz & 1 \\\\\n", "\t33 & cadillac & 1 \\\\\n", "\t34 & mercedes & 1 \\\\\n", "\t35 & vokswagen & 1 \\\\\n", "\t36 & triumph & 1 \\\\\n", "\t37 & nissan & 1 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "37×2 DataFrame\n", "│ Row │ brand │ origin_function │\n", "│ │ \u001b[90mString\u001b[39m │ \u001b[90mInt64\u001b[39m │\n", "├─────┼───────────────┼─────────────────┤\n", "│ 1 │ chevrolet │ 1 │\n", "│ 2 │ buick │ 1 │\n", "│ 3 │ plymouth │ 1 │\n", "│ 4 │ amc │ 1 │\n", "│ 5 │ ford │ 1 │\n", "│ 6 │ pontiac │ 1 │\n", "│ 7 │ dodge │ 1 │\n", "│ 8 │ toyota │ 1 │\n", "│ 9 │ datsun │ 1 │\n", "│ 10 │ volkswagen │ 1 │\n", "│ 11 │ peugeot │ 1 │\n", "│ 12 │ audi │ 1 │\n", "│ 13 │ saab │ 1 │\n", "│ 14 │ bmw │ 1 │\n", "│ 15 │ chevy │ 1 │\n", "│ 16 │ hi │ 1 │\n", "│ 17 │ mercury │ 1 │\n", "│ 18 │ opel │ 1 │\n", "│ 19 │ fiat │ 1 │\n", "│ 20 │ oldsmobile │ 1 │\n", "│ 21 │ chrysler │ 1 │\n", "│ 22 │ mazda │ 1 │\n", "│ 23 │ volvo │ 1 │\n", "│ 24 │ renault │ 1 │\n", "│ 25 │ toyouta │ 1 │\n", "│ 26 │ maxda │ 1 │\n", "│ 27 │ honda │ 1 │\n", "│ 28 │ subaru │ 1 │\n", "│ 29 │ chevroelt │ 1 │\n", "│ 30 │ capri │ 1 │\n", "│ 31 │ vw │ 1 │\n", "│ 32 │ mercedes-benz │ 1 │\n", "│ 33 │ cadillac │ 1 │\n", "│ 34 │ mercedes │ 1 │\n", "│ 35 │ vokswagen │ 1 │\n", "│ 36 │ triumph │ 1 │\n", "│ 37 │ nissan │ 1 │" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orig_brand = @pipe df |>\n", " groupby(_, :brand) |>\n", " combine(_, :origin => x -> length(unique(x)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let us make sure that we always get exactly one origin:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1, 1)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "extrema(orig_brand.origin_function)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We also could have grouped our data frame by two columns:" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

37 rows × 3 columns

originbrandnrow
Float64StringInt64
11.0chevrolet43
21.0buick17
31.0plymouth31
41.0amc27
51.0ford48
61.0pontiac16
71.0dodge28
83.0toyota25
93.0datsun23
102.0volkswagen15
112.0peugeot8
122.0audi7
132.0saab4
142.0bmw2
151.0chevy3
161.0hi1
171.0mercury11
182.0opel4
192.0fiat8
201.0oldsmobile10
211.0chrysler6
223.0mazda10
232.0volvo6
242.0renault3
253.0toyouta1
263.0maxda2
273.0honda13
283.0subaru4
291.0chevroelt1
301.0capri1
312.0vw6
322.0mercedes-benz2
331.0cadillac2
342.0mercedes1
352.0vokswagen1
362.0triumph1
373.0nissan1
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& origin & brand & nrow\\\\\n", "\t\\hline\n", "\t& Float64 & String & Int64\\\\\n", "\t\\hline\n", "\t1 & 1.0 & chevrolet & 43 \\\\\n", "\t2 & 1.0 & buick & 17 \\\\\n", "\t3 & 1.0 & plymouth & 31 \\\\\n", "\t4 & 1.0 & amc & 27 \\\\\n", "\t5 & 1.0 & ford & 48 \\\\\n", "\t6 & 1.0 & pontiac & 16 \\\\\n", "\t7 & 1.0 & dodge & 28 \\\\\n", "\t8 & 3.0 & toyota & 25 \\\\\n", "\t9 & 3.0 & datsun & 23 \\\\\n", "\t10 & 2.0 & volkswagen & 15 \\\\\n", "\t11 & 2.0 & peugeot & 8 \\\\\n", "\t12 & 2.0 & audi & 7 \\\\\n", "\t13 & 2.0 & saab & 4 \\\\\n", "\t14 & 2.0 & bmw & 2 \\\\\n", "\t15 & 1.0 & chevy & 3 \\\\\n", "\t16 & 1.0 & hi & 1 \\\\\n", "\t17 & 1.0 & mercury & 11 \\\\\n", "\t18 & 2.0 & opel & 4 \\\\\n", "\t19 & 2.0 & fiat & 8 \\\\\n", "\t20 & 1.0 & oldsmobile & 10 \\\\\n", "\t21 & 1.0 & chrysler & 6 \\\\\n", "\t22 & 3.0 & mazda & 10 \\\\\n", "\t23 & 2.0 & volvo & 6 \\\\\n", "\t24 & 2.0 & renault & 3 \\\\\n", "\t25 & 3.0 & toyouta & 1 \\\\\n", "\t26 & 3.0 & maxda & 2 \\\\\n", "\t27 & 3.0 & honda & 13 \\\\\n", "\t28 & 3.0 & subaru & 4 \\\\\n", "\t29 & 1.0 & chevroelt & 1 \\\\\n", "\t30 & 1.0 & capri & 1 \\\\\n", "\t31 & 2.0 & vw & 6 \\\\\n", "\t32 & 2.0 & mercedes-benz & 2 \\\\\n", "\t33 & 1.0 & cadillac & 2 \\\\\n", "\t34 & 2.0 & mercedes & 1 \\\\\n", "\t35 & 2.0 & vokswagen & 1 \\\\\n", "\t36 & 2.0 & triumph & 1 \\\\\n", "\t37 & 3.0 & nissan & 1 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "37×3 DataFrame\n", "│ Row │ origin │ brand │ nrow │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mInt64\u001b[39m │\n", "├─────┼─────────┼───────────────┼───────┤\n", "│ 1 │ 1.0 │ chevrolet │ 43 │\n", "│ 2 │ 1.0 │ buick │ 17 │\n", "│ 3 │ 1.0 │ plymouth │ 31 │\n", "│ 4 │ 1.0 │ amc │ 27 │\n", "│ 5 │ 1.0 │ ford │ 48 │\n", "│ 6 │ 1.0 │ pontiac │ 16 │\n", "│ 7 │ 1.0 │ dodge │ 28 │\n", "│ 8 │ 3.0 │ toyota │ 25 │\n", "│ 9 │ 3.0 │ datsun │ 23 │\n", "│ 10 │ 2.0 │ volkswagen │ 15 │\n", "│ 11 │ 2.0 │ peugeot │ 8 │\n", "│ 12 │ 2.0 │ audi │ 7 │\n", "│ 13 │ 2.0 │ saab │ 4 │\n", "│ 14 │ 2.0 │ bmw │ 2 │\n", "│ 15 │ 1.0 │ chevy │ 3 │\n", "│ 16 │ 1.0 │ hi │ 1 │\n", "│ 17 │ 1.0 │ mercury │ 11 │\n", "│ 18 │ 2.0 │ opel │ 4 │\n", "│ 19 │ 2.0 │ fiat │ 8 │\n", "│ 20 │ 1.0 │ oldsmobile │ 10 │\n", "│ 21 │ 1.0 │ chrysler │ 6 │\n", "│ 22 │ 3.0 │ mazda │ 10 │\n", "│ 23 │ 2.0 │ volvo │ 6 │\n", "│ 24 │ 2.0 │ renault │ 3 │\n", "│ 25 │ 3.0 │ toyouta │ 1 │\n", "│ 26 │ 3.0 │ maxda │ 2 │\n", "│ 27 │ 3.0 │ honda │ 13 │\n", "│ 28 │ 3.0 │ subaru │ 4 │\n", "│ 29 │ 1.0 │ chevroelt │ 1 │\n", "│ 30 │ 1.0 │ capri │ 1 │\n", "│ 31 │ 2.0 │ vw │ 6 │\n", "│ 32 │ 2.0 │ mercedes-benz │ 2 │\n", "│ 33 │ 1.0 │ cadillac │ 2 │\n", "│ 34 │ 2.0 │ mercedes │ 1 │\n", "│ 35 │ 2.0 │ vokswagen │ 1 │\n", "│ 36 │ 2.0 │ triumph │ 1 │\n", "│ 37 │ 3.0 │ nissan │ 1 │" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "origin_brand2 = @pipe df |>\n", " groupby(_, [:origin, :brand]) |>\n", " combine(_, nrow)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that passing `nrow` as a combining operation gives us the number of rows per group." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we can do the `unstack` operation on the result to get a similar table that we got with `freqtable`:" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

37 rows × 4 columns

brand1.02.03.0
StringInt64?Int64?Int64?
1amc27missingmissing
2audimissing7missing
3bmwmissing2missing
4buick17missingmissing
5cadillac2missingmissing
6capri1missingmissing
7chevroelt1missingmissing
8chevrolet43missingmissing
9chevy3missingmissing
10chrysler6missingmissing
11datsunmissingmissing23
12dodge28missingmissing
13fiatmissing8missing
14ford48missingmissing
15hi1missingmissing
16hondamissingmissing13
17maxdamissingmissing2
18mazdamissingmissing10
19mercedesmissing1missing
20mercedes-benzmissing2missing
21mercury11missingmissing
22nissanmissingmissing1
23oldsmobile10missingmissing
24opelmissing4missing
25peugeotmissing8missing
26plymouth31missingmissing
27pontiac16missingmissing
28renaultmissing3missing
29saabmissing4missing
30subarumissingmissing4
31toyotamissingmissing25
32toyoutamissingmissing1
33triumphmissing1missing
34vokswagenmissing1missing
35volkswagenmissing15missing
36volvomissing6missing
37vwmissing6missing
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& brand & 1.0 & 2.0 & 3.0\\\\\n", "\t\\hline\n", "\t& String & Int64? & Int64? & Int64?\\\\\n", "\t\\hline\n", "\t1 & amc & 27 & \\emph{missing} & \\emph{missing} \\\\\n", "\t2 & audi & \\emph{missing} & 7 & \\emph{missing} \\\\\n", "\t3 & bmw & \\emph{missing} & 2 & \\emph{missing} \\\\\n", "\t4 & buick & 17 & \\emph{missing} & \\emph{missing} \\\\\n", "\t5 & cadillac & 2 & \\emph{missing} & \\emph{missing} \\\\\n", "\t6 & capri & 1 & \\emph{missing} & \\emph{missing} \\\\\n", "\t7 & chevroelt & 1 & \\emph{missing} & \\emph{missing} \\\\\n", "\t8 & chevrolet & 43 & \\emph{missing} & \\emph{missing} \\\\\n", "\t9 & chevy & 3 & \\emph{missing} & \\emph{missing} \\\\\n", "\t10 & chrysler & 6 & \\emph{missing} & \\emph{missing} \\\\\n", "\t11 & datsun & \\emph{missing} & \\emph{missing} & 23 \\\\\n", "\t12 & dodge & 28 & \\emph{missing} & \\emph{missing} \\\\\n", "\t13 & fiat & \\emph{missing} & 8 & \\emph{missing} \\\\\n", "\t14 & ford & 48 & \\emph{missing} & \\emph{missing} \\\\\n", "\t15 & hi & 1 & \\emph{missing} & \\emph{missing} \\\\\n", "\t16 & honda & \\emph{missing} & \\emph{missing} & 13 \\\\\n", "\t17 & maxda & \\emph{missing} & \\emph{missing} & 2 \\\\\n", "\t18 & mazda & \\emph{missing} & \\emph{missing} & 10 \\\\\n", "\t19 & mercedes & \\emph{missing} & 1 & \\emph{missing} \\\\\n", "\t20 & mercedes-benz & \\emph{missing} & 2 & \\emph{missing} \\\\\n", "\t21 & mercury & 11 & \\emph{missing} & \\emph{missing} \\\\\n", "\t22 & nissan & \\emph{missing} & \\emph{missing} & 1 \\\\\n", "\t23 & oldsmobile & 10 & \\emph{missing} & \\emph{missing} \\\\\n", "\t24 & opel & \\emph{missing} & 4 & \\emph{missing} \\\\\n", "\t25 & peugeot & \\emph{missing} & 8 & \\emph{missing} \\\\\n", "\t26 & plymouth & 31 & \\emph{missing} & \\emph{missing} \\\\\n", "\t27 & pontiac & 16 & \\emph{missing} & \\emph{missing} \\\\\n", "\t28 & renault & \\emph{missing} & 3 & \\emph{missing} \\\\\n", "\t29 & saab & \\emph{missing} & 4 & \\emph{missing} \\\\\n", "\t30 & subaru & \\emph{missing} & \\emph{missing} & 4 \\\\\n", "\t31 & toyota & \\emph{missing} & \\emph{missing} & 25 \\\\\n", "\t32 & toyouta & \\emph{missing} & \\emph{missing} & 1 \\\\\n", "\t33 & triumph & \\emph{missing} & 1 & \\emph{missing} \\\\\n", "\t34 & vokswagen & \\emph{missing} & 1 & \\emph{missing} \\\\\n", "\t35 & volkswagen & \\emph{missing} & 15 & \\emph{missing} \\\\\n", "\t36 & volvo & \\emph{missing} & 6 & \\emph{missing} \\\\\n", "\t37 & vw & \\emph{missing} & 6 & \\emph{missing} \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "37×4 DataFrame\n", "│ Row │ brand │ 1.0 │ 2.0 │ 3.0 │\n", "│ │ \u001b[90mString\u001b[39m │ \u001b[90mInt64?\u001b[39m │ \u001b[90mInt64?\u001b[39m │ \u001b[90mInt64?\u001b[39m │\n", "├─────┼───────────────┼─────────┼─────────┼─────────┤\n", "│ 1 │ amc │ 27 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 2 │ audi │ \u001b[90mmissing\u001b[39m │ 7 │ \u001b[90mmissing\u001b[39m │\n", "│ 3 │ bmw │ \u001b[90mmissing\u001b[39m │ 2 │ \u001b[90mmissing\u001b[39m │\n", "│ 4 │ buick │ 17 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 5 │ cadillac │ 2 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 6 │ capri │ 1 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 7 │ chevroelt │ 1 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 8 │ chevrolet │ 43 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 9 │ chevy │ 3 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 10 │ chrysler │ 6 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 11 │ datsun │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ 23 │\n", "│ 12 │ dodge │ 28 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 13 │ fiat │ \u001b[90mmissing\u001b[39m │ 8 │ \u001b[90mmissing\u001b[39m │\n", "│ 14 │ ford │ 48 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 15 │ hi │ 1 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 16 │ honda │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ 13 │\n", "│ 17 │ maxda │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ 2 │\n", "│ 18 │ mazda │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ 10 │\n", "│ 19 │ mercedes │ \u001b[90mmissing\u001b[39m │ 1 │ \u001b[90mmissing\u001b[39m │\n", "│ 20 │ mercedes-benz │ \u001b[90mmissing\u001b[39m │ 2 │ \u001b[90mmissing\u001b[39m │\n", "│ 21 │ mercury │ 11 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 22 │ nissan │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ 1 │\n", "│ 23 │ oldsmobile │ 10 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 24 │ opel │ \u001b[90mmissing\u001b[39m │ 4 │ \u001b[90mmissing\u001b[39m │\n", "│ 25 │ peugeot │ \u001b[90mmissing\u001b[39m │ 8 │ \u001b[90mmissing\u001b[39m │\n", "│ 26 │ plymouth │ 31 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 27 │ pontiac │ 16 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │\n", "│ 28 │ renault │ \u001b[90mmissing\u001b[39m │ 3 │ \u001b[90mmissing\u001b[39m │\n", "│ 29 │ saab │ \u001b[90mmissing\u001b[39m │ 4 │ \u001b[90mmissing\u001b[39m │\n", "│ 30 │ subaru │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ 4 │\n", "│ 31 │ toyota │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ 25 │\n", "│ 32 │ toyouta │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ 1 │\n", "│ 33 │ triumph │ \u001b[90mmissing\u001b[39m │ 1 │ \u001b[90mmissing\u001b[39m │\n", "│ 34 │ vokswagen │ \u001b[90mmissing\u001b[39m │ 1 │ \u001b[90mmissing\u001b[39m │\n", "│ 35 │ volkswagen │ \u001b[90mmissing\u001b[39m │ 15 │ \u001b[90mmissing\u001b[39m │\n", "│ 36 │ volvo │ \u001b[90mmissing\u001b[39m │ 6 │ \u001b[90mmissing\u001b[39m │\n", "│ 37 │ vw │ \u001b[90mmissing\u001b[39m │ 6 │ \u001b[90mmissing\u001b[39m │" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "origin_vs_brand = unstack(origin_brand2, :brand, :origin, :nrow)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The only difference is that missing combinations are given as `missing` and `freqtable` produced `0` there. It is easy to fix using broadcasting:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

37 rows × 4 columns

brand1.02.03.0
StringInt64Int64Int64
1amc2700
2audi070
3bmw020
4buick1700
5cadillac200
6capri100
7chevroelt100
8chevrolet4300
9chevy300
10chrysler600
11datsun0023
12dodge2800
13fiat080
14ford4800
15hi100
16honda0013
17maxda002
18mazda0010
19mercedes010
20mercedes-benz020
21mercury1100
22nissan001
23oldsmobile1000
24opel040
25peugeot080
26plymouth3100
27pontiac1600
28renault030
29saab040
30subaru004
31toyota0025
32toyouta001
33triumph010
34vokswagen010
35volkswagen0150
36volvo060
37vw060
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& brand & 1.0 & 2.0 & 3.0\\\\\n", "\t\\hline\n", "\t& String & Int64 & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & amc & 27 & 0 & 0 \\\\\n", "\t2 & audi & 0 & 7 & 0 \\\\\n", "\t3 & bmw & 0 & 2 & 0 \\\\\n", "\t4 & buick & 17 & 0 & 0 \\\\\n", "\t5 & cadillac & 2 & 0 & 0 \\\\\n", "\t6 & capri & 1 & 0 & 0 \\\\\n", "\t7 & chevroelt & 1 & 0 & 0 \\\\\n", "\t8 & chevrolet & 43 & 0 & 0 \\\\\n", "\t9 & chevy & 3 & 0 & 0 \\\\\n", "\t10 & chrysler & 6 & 0 & 0 \\\\\n", "\t11 & datsun & 0 & 0 & 23 \\\\\n", "\t12 & dodge & 28 & 0 & 0 \\\\\n", "\t13 & fiat & 0 & 8 & 0 \\\\\n", "\t14 & ford & 48 & 0 & 0 \\\\\n", "\t15 & hi & 1 & 0 & 0 \\\\\n", "\t16 & honda & 0 & 0 & 13 \\\\\n", "\t17 & maxda & 0 & 0 & 2 \\\\\n", "\t18 & mazda & 0 & 0 & 10 \\\\\n", "\t19 & mercedes & 0 & 1 & 0 \\\\\n", "\t20 & mercedes-benz & 0 & 2 & 0 \\\\\n", "\t21 & mercury & 11 & 0 & 0 \\\\\n", "\t22 & nissan & 0 & 0 & 1 \\\\\n", "\t23 & oldsmobile & 10 & 0 & 0 \\\\\n", "\t24 & opel & 0 & 4 & 0 \\\\\n", "\t25 & peugeot & 0 & 8 & 0 \\\\\n", "\t26 & plymouth & 31 & 0 & 0 \\\\\n", "\t27 & pontiac & 16 & 0 & 0 \\\\\n", "\t28 & renault & 0 & 3 & 0 \\\\\n", "\t29 & saab & 0 & 4 & 0 \\\\\n", "\t30 & subaru & 0 & 0 & 4 \\\\\n", "\t31 & toyota & 0 & 0 & 25 \\\\\n", "\t32 & toyouta & 0 & 0 & 1 \\\\\n", "\t33 & triumph & 0 & 1 & 0 \\\\\n", "\t34 & vokswagen & 0 & 1 & 0 \\\\\n", "\t35 & volkswagen & 0 & 15 & 0 \\\\\n", "\t36 & volvo & 0 & 6 & 0 \\\\\n", "\t37 & vw & 0 & 6 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "37×4 DataFrame\n", "│ Row │ brand │ 1.0 │ 2.0 │ 3.0 │\n", "│ │ \u001b[90mString\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │\n", "├─────┼───────────────┼───────┼───────┼───────┤\n", "│ 1 │ amc │ 27 │ 0 │ 0 │\n", "│ 2 │ audi │ 0 │ 7 │ 0 │\n", "│ 3 │ bmw │ 0 │ 2 │ 0 │\n", "│ 4 │ buick │ 17 │ 0 │ 0 │\n", "│ 5 │ cadillac │ 2 │ 0 │ 0 │\n", "│ 6 │ capri │ 1 │ 0 │ 0 │\n", "│ 7 │ chevroelt │ 1 │ 0 │ 0 │\n", "│ 8 │ chevrolet │ 43 │ 0 │ 0 │\n", "│ 9 │ chevy │ 3 │ 0 │ 0 │\n", "│ 10 │ chrysler │ 6 │ 0 │ 0 │\n", "│ 11 │ datsun │ 0 │ 0 │ 23 │\n", "│ 12 │ dodge │ 28 │ 0 │ 0 │\n", "│ 13 │ fiat │ 0 │ 8 │ 0 │\n", "│ 14 │ ford │ 48 │ 0 │ 0 │\n", "│ 15 │ hi │ 1 │ 0 │ 0 │\n", "│ 16 │ honda │ 0 │ 0 │ 13 │\n", "│ 17 │ maxda │ 0 │ 0 │ 2 │\n", "│ 18 │ mazda │ 0 │ 0 │ 10 │\n", "│ 19 │ mercedes │ 0 │ 1 │ 0 │\n", "│ 20 │ mercedes-benz │ 0 │ 2 │ 0 │\n", "│ 21 │ mercury │ 11 │ 0 │ 0 │\n", "│ 22 │ nissan │ 0 │ 0 │ 1 │\n", "│ 23 │ oldsmobile │ 10 │ 0 │ 0 │\n", "│ 24 │ opel │ 0 │ 4 │ 0 │\n", "│ 25 │ peugeot │ 0 │ 8 │ 0 │\n", "│ 26 │ plymouth │ 31 │ 0 │ 0 │\n", "│ 27 │ pontiac │ 16 │ 0 │ 0 │\n", "│ 28 │ renault │ 0 │ 3 │ 0 │\n", "│ 29 │ saab │ 0 │ 4 │ 0 │\n", "│ 30 │ subaru │ 0 │ 0 │ 4 │\n", "│ 31 │ toyota │ 0 │ 0 │ 25 │\n", "│ 32 │ toyouta │ 0 │ 0 │ 1 │\n", "│ 33 │ triumph │ 0 │ 1 │ 0 │\n", "│ 34 │ vokswagen │ 0 │ 1 │ 0 │\n", "│ 35 │ volkswagen │ 0 │ 15 │ 0 │\n", "│ 36 │ volvo │ 0 │ 6 │ 0 │\n", "│ 37 │ vw │ 0 │ 6 │ 0 │" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coalesce.(origin_vs_brand, 0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that column names of a data frame can be any sequence of characters as in this case:" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4-element Array{String,1}:\n", " \"brand\"\n", " \"1.0\"\n", " \"2.0\"\n", " \"3.0\"" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names(origin_vs_brand)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4-element Array{Symbol,1}:\n", " :brand\n", " Symbol(\"1.0\")\n", " Symbol(\"2.0\")\n", " Symbol(\"3.0\")" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "propertynames(origin_vs_brand)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In such situations it is easiest to refer to such columns using strings, as `Symbol`s are not super easy to write when they have to represent sequences that are not allowed as variable names as you can see above." ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "37-element Array{Union{Missing, Int64},1}:\n", " 27\n", " missing\n", " missing\n", " 17\n", " 2\n", " 1\n", " 1\n", " 43\n", " 3\n", " 6\n", " missing\n", " 28\n", " missing\n", " 48\n", " 1\n", " missing\n", " missing\n", " missing\n", " missing\n", " missing\n", " 11\n", " missing\n", " 10\n", " missing\n", " missing\n", " 31\n", " 16\n", " missing\n", " missing\n", " missing\n", " missing\n", " missing\n", " missing\n", " missing\n", " missing\n", " missing\n", " missing" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "origin_vs_brand.\"1.0\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally let us show that columns of a data frame can hold any data type." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

3 rows × 2 columns

originbrand_function
Float64Array…
11.0[\"chevrolet\", \"buick\", \"plymouth\", \"amc\", \"ford\", \"pontiac\", \"dodge\", \"chevy\", \"hi\", \"mercury\", \"oldsmobile\", \"chrysler\", \"chevroelt\", \"capri\", \"cadillac\"]
23.0[\"toyota\", \"datsun\", \"mazda\", \"toyouta\", \"maxda\", \"honda\", \"subaru\", \"nissan\"]
32.0[\"volkswagen\", \"peugeot\", \"audi\", \"saab\", \"bmw\", \"opel\", \"fiat\", \"volvo\", \"renault\", \"vw\", \"mercedes-benz\", \"mercedes\", \"vokswagen\", \"triumph\"]
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& origin & brand\\_function\\\\\n", "\t\\hline\n", "\t& Float64 & Array…\\\\\n", "\t\\hline\n", "\t1 & 1.0 & [\"chevrolet\", \"buick\", \"plymouth\", \"amc\", \"ford\", \"pontiac\", \"dodge\", \"chevy\", \"hi\", \"mercury\", \"oldsmobile\", \"chrysler\", \"chevroelt\", \"capri\", \"cadillac\"] \\\\\n", "\t2 & 3.0 & [\"toyota\", \"datsun\", \"mazda\", \"toyouta\", \"maxda\", \"honda\", \"subaru\", \"nissan\"] \\\\\n", "\t3 & 2.0 & [\"volkswagen\", \"peugeot\", \"audi\", \"saab\", \"bmw\", \"opel\", \"fiat\", \"volvo\", \"renault\", \"vw\", \"mercedes-benz\", \"mercedes\", \"vokswagen\", \"triumph\"] \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "3×2 DataFrame\n", "│ Row │ origin │ brand_function │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mArray{String,1}\u001b[39m │\n", "├─────┼─────────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤\n", "│ 1 │ 1.0 │ [\"chevrolet\", \"buick\", \"plymouth\", \"amc\", \"ford\", \"pontiac\", \"dodge\", \"chevy\", \"hi\", \"mercury\", \"oldsmobile\", \"chrysler\", \"chevroelt\", \"capri\", \"cadillac\"] │\n", "│ 2 │ 3.0 │ [\"toyota\", \"datsun\", \"mazda\", \"toyouta\", \"maxda\", \"honda\", \"subaru\", \"nissan\"] │\n", "│ 3 │ 2.0 │ [\"volkswagen\", \"peugeot\", \"audi\", \"saab\", \"bmw\", \"opel\", \"fiat\", \"volvo\", \"renault\", \"vw\", \"mercedes-benz\", \"mercedes\", \"vokswagen\", \"triumph\"] │" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "origin_brand3 = @pipe df |>\n", " groupby(_, :origin) |>\n", " combine(_, :brand => x -> Ref(unique(x)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that we used `Ref` to protect the return value of our function to be broadcasted (this is the same trick as in standard broadcasting in Julia). Otherwise we would get:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

37 rows × 2 columns

originbrand_unique
Float64String
11.0chevrolet
21.0buick
31.0plymouth
41.0amc
51.0ford
61.0pontiac
71.0dodge
81.0chevy
91.0hi
101.0mercury
111.0oldsmobile
121.0chrysler
131.0chevroelt
141.0capri
151.0cadillac
163.0toyota
173.0datsun
183.0mazda
193.0toyouta
203.0maxda
213.0honda
223.0subaru
233.0nissan
242.0volkswagen
252.0peugeot
262.0audi
272.0saab
282.0bmw
292.0opel
302.0fiat
312.0volvo
322.0renault
332.0vw
342.0mercedes-benz
352.0mercedes
362.0vokswagen
372.0triumph
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& origin & brand\\_unique\\\\\n", "\t\\hline\n", "\t& Float64 & String\\\\\n", "\t\\hline\n", "\t1 & 1.0 & chevrolet \\\\\n", "\t2 & 1.0 & buick \\\\\n", "\t3 & 1.0 & plymouth \\\\\n", "\t4 & 1.0 & amc \\\\\n", "\t5 & 1.0 & ford \\\\\n", "\t6 & 1.0 & pontiac \\\\\n", "\t7 & 1.0 & dodge \\\\\n", "\t8 & 1.0 & chevy \\\\\n", "\t9 & 1.0 & hi \\\\\n", "\t10 & 1.0 & mercury \\\\\n", "\t11 & 1.0 & oldsmobile \\\\\n", "\t12 & 1.0 & chrysler \\\\\n", "\t13 & 1.0 & chevroelt \\\\\n", "\t14 & 1.0 & capri \\\\\n", "\t15 & 1.0 & cadillac \\\\\n", "\t16 & 3.0 & toyota \\\\\n", "\t17 & 3.0 & datsun \\\\\n", "\t18 & 3.0 & mazda \\\\\n", "\t19 & 3.0 & toyouta \\\\\n", "\t20 & 3.0 & maxda \\\\\n", "\t21 & 3.0 & honda \\\\\n", "\t22 & 3.0 & subaru \\\\\n", "\t23 & 3.0 & nissan \\\\\n", "\t24 & 2.0 & volkswagen \\\\\n", "\t25 & 2.0 & peugeot \\\\\n", "\t26 & 2.0 & audi \\\\\n", "\t27 & 2.0 & saab \\\\\n", "\t28 & 2.0 & bmw \\\\\n", "\t29 & 2.0 & opel \\\\\n", "\t30 & 2.0 & fiat \\\\\n", "\t31 & 2.0 & volvo \\\\\n", "\t32 & 2.0 & renault \\\\\n", "\t33 & 2.0 & vw \\\\\n", "\t34 & 2.0 & mercedes-benz \\\\\n", "\t35 & 2.0 & mercedes \\\\\n", "\t36 & 2.0 & vokswagen \\\\\n", "\t37 & 2.0 & triumph \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "37×2 DataFrame\n", "│ Row │ origin │ brand_unique │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │\n", "├─────┼─────────┼───────────────┤\n", "│ 1 │ 1.0 │ chevrolet │\n", "│ 2 │ 1.0 │ buick │\n", "│ 3 │ 1.0 │ plymouth │\n", "│ 4 │ 1.0 │ amc │\n", "│ 5 │ 1.0 │ ford │\n", "│ 6 │ 1.0 │ pontiac │\n", "│ 7 │ 1.0 │ dodge │\n", "│ 8 │ 1.0 │ chevy │\n", "│ 9 │ 1.0 │ hi │\n", "│ 10 │ 1.0 │ mercury │\n", "│ 11 │ 1.0 │ oldsmobile │\n", "│ 12 │ 1.0 │ chrysler │\n", "│ 13 │ 1.0 │ chevroelt │\n", "│ 14 │ 1.0 │ capri │\n", "│ 15 │ 1.0 │ cadillac │\n", "│ 16 │ 3.0 │ toyota │\n", "│ 17 │ 3.0 │ datsun │\n", "│ 18 │ 3.0 │ mazda │\n", "│ 19 │ 3.0 │ toyouta │\n", "│ 20 │ 3.0 │ maxda │\n", "│ 21 │ 3.0 │ honda │\n", "│ 22 │ 3.0 │ subaru │\n", "│ 23 │ 3.0 │ nissan │\n", "│ 24 │ 2.0 │ volkswagen │\n", "│ 25 │ 2.0 │ peugeot │\n", "│ 26 │ 2.0 │ audi │\n", "│ 27 │ 2.0 │ saab │\n", "│ 28 │ 2.0 │ bmw │\n", "│ 29 │ 2.0 │ opel │\n", "│ 30 │ 2.0 │ fiat │\n", "│ 31 │ 2.0 │ volvo │\n", "│ 32 │ 2.0 │ renault │\n", "│ 33 │ 2.0 │ vw │\n", "│ 34 │ 2.0 │ mercedes-benz │\n", "│ 35 │ 2.0 │ mercedes │\n", "│ 36 │ 2.0 │ vokswagen │\n", "│ 37 │ 2.0 │ triumph │" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "@pipe df |> groupby(_, :origin) |> combine(_, :brand => unique)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we check if the groups are pairwise disjoint (this is a bit tricky, but it is much fun to be able to write such code in Julia):" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 vs 2 => String[]\n", "1 vs 3 => String[]\n", "2 vs 3 => String[]\n" ] } ], "source": [ "for i in axes(origin_brand3, 1)\n", " for j in i+1:nrow(origin_brand3)\n", " println(\"$i vs $j => \", intersect(origin_brand3.brand_function[[i,j]]...))\n", " end\n", "end" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can also easily flatten the `origin_brand3` data frame so that `brand_function` column is expanded to span multiple rows (the case we have seen above when we omitted writing `Ref` in `combine`)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

37 rows × 2 columns

originbrand_function
Float64String
11.0chevrolet
21.0buick
31.0plymouth
41.0amc
51.0ford
61.0pontiac
71.0dodge
81.0chevy
91.0hi
101.0mercury
111.0oldsmobile
121.0chrysler
131.0chevroelt
141.0capri
151.0cadillac
163.0toyota
173.0datsun
183.0mazda
193.0toyouta
203.0maxda
213.0honda
223.0subaru
233.0nissan
242.0volkswagen
252.0peugeot
262.0audi
272.0saab
282.0bmw
292.0opel
302.0fiat
312.0volvo
322.0renault
332.0vw
342.0mercedes-benz
352.0mercedes
362.0vokswagen
372.0triumph
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& origin & brand\\_function\\\\\n", "\t\\hline\n", "\t& Float64 & String\\\\\n", "\t\\hline\n", "\t1 & 1.0 & chevrolet \\\\\n", "\t2 & 1.0 & buick \\\\\n", "\t3 & 1.0 & plymouth \\\\\n", "\t4 & 1.0 & amc \\\\\n", "\t5 & 1.0 & ford \\\\\n", "\t6 & 1.0 & pontiac \\\\\n", "\t7 & 1.0 & dodge \\\\\n", "\t8 & 1.0 & chevy \\\\\n", "\t9 & 1.0 & hi \\\\\n", "\t10 & 1.0 & mercury \\\\\n", "\t11 & 1.0 & oldsmobile \\\\\n", "\t12 & 1.0 & chrysler \\\\\n", "\t13 & 1.0 & chevroelt \\\\\n", "\t14 & 1.0 & capri \\\\\n", "\t15 & 1.0 & cadillac \\\\\n", "\t16 & 3.0 & toyota \\\\\n", "\t17 & 3.0 & datsun \\\\\n", "\t18 & 3.0 & mazda \\\\\n", "\t19 & 3.0 & toyouta \\\\\n", "\t20 & 3.0 & maxda \\\\\n", "\t21 & 3.0 & honda \\\\\n", "\t22 & 3.0 & subaru \\\\\n", "\t23 & 3.0 & nissan \\\\\n", "\t24 & 2.0 & volkswagen \\\\\n", "\t25 & 2.0 & peugeot \\\\\n", "\t26 & 2.0 & audi \\\\\n", "\t27 & 2.0 & saab \\\\\n", "\t28 & 2.0 & bmw \\\\\n", "\t29 & 2.0 & opel \\\\\n", "\t30 & 2.0 & fiat \\\\\n", "\t31 & 2.0 & volvo \\\\\n", "\t32 & 2.0 & renault \\\\\n", "\t33 & 2.0 & vw \\\\\n", "\t34 & 2.0 & mercedes-benz \\\\\n", "\t35 & 2.0 & mercedes \\\\\n", "\t36 & 2.0 & vokswagen \\\\\n", "\t37 & 2.0 & triumph \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "37×2 DataFrame\n", "│ Row │ origin │ brand_function │\n", "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │\n", "├─────┼─────────┼────────────────┤\n", "│ 1 │ 1.0 │ chevrolet │\n", "│ 2 │ 1.0 │ buick │\n", "│ 3 │ 1.0 │ plymouth │\n", "│ 4 │ 1.0 │ amc │\n", "│ 5 │ 1.0 │ ford │\n", "│ 6 │ 1.0 │ pontiac │\n", "│ 7 │ 1.0 │ dodge │\n", "│ 8 │ 1.0 │ chevy │\n", "│ 9 │ 1.0 │ hi │\n", "│ 10 │ 1.0 │ mercury │\n", "│ 11 │ 1.0 │ oldsmobile │\n", "│ 12 │ 1.0 │ chrysler │\n", "│ 13 │ 1.0 │ chevroelt │\n", "│ 14 │ 1.0 │ capri │\n", "│ 15 │ 1.0 │ cadillac │\n", "│ 16 │ 3.0 │ toyota │\n", "│ 17 │ 3.0 │ datsun │\n", "│ 18 │ 3.0 │ mazda │\n", "│ 19 │ 3.0 │ toyouta │\n", "│ 20 │ 3.0 │ maxda │\n", "│ 21 │ 3.0 │ honda │\n", "│ 22 │ 3.0 │ subaru │\n", "│ 23 │ 3.0 │ nissan │\n", "│ 24 │ 2.0 │ volkswagen │\n", "│ 25 │ 2.0 │ peugeot │\n", "│ 26 │ 2.0 │ audi │\n", "│ 27 │ 2.0 │ saab │\n", "│ 28 │ 2.0 │ bmw │\n", "│ 29 │ 2.0 │ opel │\n", "│ 30 │ 2.0 │ fiat │\n", "│ 31 │ 2.0 │ volvo │\n", "│ 32 │ 2.0 │ renault │\n", "│ 33 │ 2.0 │ vw │\n", "│ 34 │ 2.0 │ mercedes-benz │\n", "│ 35 │ 2.0 │ mercedes │\n", "│ 36 │ 2.0 │ vokswagen │\n", "│ 37 │ 2.0 │ triumph │" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "flatten(origin_brand3, :brand_function)" ] } ], "metadata": { "kernelspec": { "display_name": "Julia 1.4.1", "language": "julia", "name": "julia-1.4" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.4.1" } }, "nbformat": 4, "nbformat_minor": 4 }