Taller de Carga y Exploración Simple de de Datos¶

In [1]:

            
                Copied!
                
                    
                    
                
                

        
try:
    import google.colab
    !pip uninstall matplotlib -y
    !pip install -q condacolab
    !apt-get install unrar-free
    
    import condacolab
    condacolab.install_mambaforge()
except ModuleNotFoundError:
    pass
try:
    import google.colab
    !pip uninstall matplotlib -y
    !pip install -q condacolab
    !apt-get install unrar-free
    
    import condacolab
    condacolab.install_mambaforge()
except ModuleNotFoundError:
    pass

In [2]:

            
                Copied!
                
                    
                    
                
                

        
try:
    import google.colab
    !git clone https://github.com/zorzalerrante/aves.git aves_git
    !mamba env update --name base --file aves_git/environment-colab.yml
except ModuleNotFoundError:
    pass
try:
    import google.colab
    !git clone https://github.com/zorzalerrante/aves.git aves_git
    !mamba env update --name base --file aves_git/environment-colab.yml
except ModuleNotFoundError:
    pass

In [3]:

            
                Copied!
                
                    
                    
                
                

        
try:
    import google.colab
    !cd aves_git && make download-external
    !cd ..
except ModuleNotFoundError:
    pass
try:
    import google.colab
    !cd aves_git && make download-external
    !cd ..
except ModuleNotFoundError:
    pass

In [4]:

            
                Copied!
                
import sys
from pathlib import Path

AVES_ROOT = Path("..") if not 'google.colab' in sys.modules else Path("aves_git")
DATA_PATH = AVES_ROOT / "data" / "external"
DATA_PATH
import sys
from pathlib import Path

AVES_ROOT = Path("..") if not 'google.colab' in sys.modules else Path("aves_git")
DATA_PATH = AVES_ROOT / "data" / "external"
DATA_PATH

Out[4]:

PosixPath('../data/external')

In [5]:

            
                Copied!
                
                    
                    
                
                

        
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.utils import deaccent

sns.set(
    style="whitegrid", context="paper", font_scale=0.8, font="Fira Sans Extra Condensed"
)
mpl.rcParams["figure.dpi"] = 120
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.utils import deaccent

sns.set(
    style="whitegrid", context="paper", font_scale=0.8, font="Fira Sans Extra Condensed"
)
mpl.rcParams["figure.dpi"] = 120

Para entender la CASEN:

http://observatorio.ministeriodesarrollosocial.gob.cl/storage/docs/casen/2017/Libro_de_Codigos_Casen_2017.pdf http://observatorio.ministeriodesarrollosocial.gob.cl/storage/docs/casen/2020/Libro_de_codigos_Base_de_Datos_Casen_en_Pandemia_2020.pdf

Códigos de comunas: http://www.subdere.gov.cl/documentacion/c%C3%B3digos-%C3%BAnicos-territoriales-actualizados-al-06-de-septiembre-2018

In [6]:

            
                Copied!
                
casen = pd.read_stata(DATA_PATH / "casen_2017/Casen 2017.dta", convert_categoricals=False)
casen.head()
casen = pd.read_stata(DATA_PATH / "casen_2017/Casen 2017.dta", convert_categoricals=False)
casen.head()

Out[6]:

	folio	o	id_vivienda	hogar	region	provincia	comuna	zona	expr	expc	...	hh_d_estado	hh_d_servbas	hh_d_entorno	hh_d_medio	hh_d_appart	hh_d_tsocial	hh_d_seg	pobreza_multi_4d	pobreza_multi_5d
0	1.101100e+11	1.0	1.101100e+09	1.0	1.0	11.0	1101.0	1.0	39.0	33.0	...	0.0	1.0	1.0	1.0	0.0	0.0	1.0	NaN	NaN
1	1.101100e+11	1.0	1.101100e+09	1.0	1.0	11.0	1101.0	1.0	39.0	33.0	...	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0
2	1.101100e+11	1.0	1.101100e+09	1.0	1.0	11.0	1101.0	1.0	39.0	33.0	...	1.0	0.0	1.0	1.0	0.0	1.0	1.0	NaN	NaN
3	1.101100e+11	2.0	1.101100e+09	1.0	1.0	11.0	1101.0	1.0	39.0	33.0	...	1.0	0.0	1.0	1.0	0.0	1.0	1.0	NaN	NaN
4	1.101100e+11	1.0	1.101100e+09	1.0	1.0	11.0	1101.0	1.0	39.0	33.0	...	1.0	1.0	1.0	1.0	NaN	1.0	0.0	0.0	NaN

5 rows × 804 columns

In [7]:

            
                Copied!
                
comunas = casen["comuna"].unique().astype(int)
comunas
comunas = casen["comuna"].unique().astype(int)
comunas

Out[7]:

array([ 1101,  1107,  1401,  1402,  1404,  1405,  2101,  2102,  2103,
        2104,  2201,  2203,  2301,  2302,  3101,  3102,  3103,  3201,
        3202,  3301,  3302,  3303,  3304,  4101,  4102,  4103,  4104,
        4105,  4106,  4201,  4202,  4203,  4204,  4301,  4302,  4303,
        4304,  4305,  5101,  5102,  5103,  5105,  5107,  5109,  5301,
        5302,  5303,  5304,  5401,  5402,  5403,  5404,  5405,  5501,
        5502,  5503,  5504,  5506,  5601,  5602,  5603,  5604,  5605,
        5606,  5701,  5702,  5703,  5704,  5705,  5706,  5801,  5802,
        5803,  5804,  6101,  6102,  6103,  6104,  6105,  6106,  6107,
        6108,  6109,  6110,  6111,  6112,  6113,  6114,  6115,  6116,
        6117,  6201,  6202,  6203,  6204,  6205,  6206,  6301,  6302,
        6303,  6304,  6305,  6306,  6307,  6308,  6309,  6310,  7101,
        7102,  7103,  7104,  7105,  7106,  7107,  7108,  7109,  7110,
        7201,  7202,  7203,  7301,  7302,  7303,  7304,  7305,  7306,
        7307,  7308,  7309,  7401,  7402,  7403,  7404,  7405,  7406,
        7407,  7408,  8101,  8102,  8103,  8104,  8105,  8106,  8107,
        8108,  8109,  8110,  8111,  8112,  8201,  8202,  8203,  8204,
        8205,  8206,  8207,  8301,  8302,  8303,  8304,  8305,  8306,
        8307,  8308,  8309,  8310,  8311,  8312,  8313,  8314, 16101,
       16102, 16202, 16203, 16302, 16103, 16104, 16204, 16303, 16105,
       16106, 16205, 16107, 16201, 16206, 16301, 16304, 16108, 16305,
       16207, 16109,  9101,  9102,  9103,  9104,  9105,  9106,  9107,
        9108,  9109,  9110,  9111,  9112,  9113,  9114,  9115,  9116,
        9117,  9118,  9119,  9120,  9121,  9201,  9202,  9203,  9204,
        9205,  9206,  9207,  9208,  9209,  9210,  9211, 10101, 10102,
       10104, 10105, 10106, 10107, 10108, 10109, 10201, 10202, 10203,
       10204, 10205, 10206, 10207, 10208, 10209, 10210, 10301, 10302,
       10303, 10304, 10305, 10306, 10307, 11101, 11201, 11202, 11301,
       11401, 11402, 12101, 12301, 12401, 13101, 13102, 13103, 13104,
       13105, 13106, 13107, 13108, 13109, 13110, 13111, 13112, 13113,
       13114, 13115, 13116, 13117, 13118, 13119, 13120, 13121, 13122,
       13123, 13124, 13125, 13126, 13127, 13128, 13129, 13130, 13131,
       13132, 13201, 13202, 13203, 13301, 13302, 13303, 13401, 13402,
       13403, 13404, 13501, 13502, 13503, 13504, 13505, 13601, 13602,
       13603, 13604, 13605, 14101, 14102, 14103, 14104, 14105, 14106,
       14107, 14108, 14201, 14202, 14203, 14204, 15101, 15102, 15201])

In [8]:

            
                Copied!
                
len(comunas)
len(comunas)

Out[8]:

In [9]:

            
                Copied!
                
casen_2020 = pd.read_stata(
    DATA_PATH / "casen_2020/Casen en Pandemia 2020 STATA.dta"
)
casen_2020.head()
casen_2020 = pd.read_stata(
    DATA_PATH / "casen_2020/Casen en Pandemia 2020 STATA.dta"
)
casen_2020.head()

/home/egraells/miniconda3/envs/aves/lib/python3.8/site-packages/pandas/io/stata.py:1457: UnicodeWarning: 
One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  warnings.warn(msg, UnicodeWarning)

Out[9]:

	folio	o	id_persona	id_vivienda	region	provincia	comuna	zona	area	segmento	...	v27	v28	v29	hacinamiento	nhog_viv	indsan	ten_viv	ten_viv_f	iae	iai
0	1.101100e+11	1	5	1.101100e+09	Región de Tarapacá	Iquique	Iquique	Urbano	Urbano	11011001	...	1	NaN	NaN	Sin hacinamiento (menos de 2,5 personas por do...	1	Deficitario	Arrendada	Tenencia informal	Sin allegamiento externo	sin allegamiento interno
1	1.101100e+11	2	6	1.101100e+09	Región de Tarapacá	Iquique	Iquique	Urbano	Urbano	11011001	...	1	NaN	NaN	Sin hacinamiento (menos de 2,5 personas por do...	1	Deficitario	Arrendada	Tenencia informal	Sin allegamiento externo	sin allegamiento interno
2	1.101100e+11	2	31	1.101100e+09	Región de Tarapacá	Iquique	Iquique	Urbano	Urbano	11011001	...	1	NaN	NaN	Hacinamiento medio (entre 2,5 y 3,49 personas ...	1	Aceptable	Arrendada	Tenencia formal	Sin allegamiento externo	sin allegamiento interno
3	1.101100e+11	1	32	1.101100e+09	Región de Tarapacá	Iquique	Iquique	Urbano	Urbano	11011001	...	1	NaN	NaN	Hacinamiento medio (entre 2,5 y 3,49 personas ...	1	Aceptable	Arrendada	Tenencia formal	Sin allegamiento externo	sin allegamiento interno
4	1.101100e+11	3	30	1.101100e+09	Región de Tarapacá	Iquique	Iquique	Urbano	Urbano	11011001	...	1	NaN	NaN	Hacinamiento medio (entre 2,5 y 3,49 personas ...	1	Aceptable	Arrendada	Tenencia formal	Sin allegamiento externo	sin allegamiento interno

5 rows × 650 columns

In [10]:

            
                Copied!
                
elecciones = pd.read_csv(
    DATA_PATH / "presidenciales_2021/Servel_20211121_PRESIDENCIALES_CHILE.csv",
    sep=";",
)
elecciones.head()
elecciones = pd.read_csv(
    DATA_PATH / "presidenciales_2021/Servel_20211121_PRESIDENCIALES_CHILE.csv",
    sep=";",
)
elecciones.head()

Out[10]:

	eleccion_id	eleccion_nombre	region_id	region_nombre	provincia_id	provincia_nombre	comuna_id	comuna_nombre	circele_id	circele_nombre	...	codigo_colegio	nombre_colegio	sede_colegio	cs_id	cs	distrito_id	distrito	lista	pacto	partido_politico
0	4	Elección de Presidente 2021 ...	3001	DE TARAPACA ...	4011	IQUIQUE ...	2501	IQUIQUE ...	7007	IQUIQUE ...	...	101	LICEO DEPORTIVO ELENA DUVAUCHELLE CABEZON ...	ANIBAL PINTO 955 ...	5002	CIRCUNSCRIPCIÓN SENATORIAL 2 ...	6002	DISTRITO 2 ...	...	SIN PACTO ...	CONVERGENCIA SOCIAL ...
1	4	Elección de Presidente 2021 ...	3001	DE TARAPACA ...	4011	IQUIQUE ...	2501	IQUIQUE ...	7007	IQUIQUE ...	...	101	LICEO DEPORTIVO ELENA DUVAUCHELLE CABEZON ...	ANIBAL PINTO 955 ...	5002	CIRCUNSCRIPCIÓN SENATORIAL 2 ...	6002	DISTRITO 2 ...	...	SIN PACTO ...	PARTIDO REPUBLICANO DE CHILE ...
2	4	Elección de Presidente 2021 ...	3001	DE TARAPACA ...	4011	IQUIQUE ...	2501	IQUIQUE ...	7007	IQUIQUE ...	...	101	LICEO DEPORTIVO ELENA DUVAUCHELLE CABEZON ...	ANIBAL PINTO 955 ...	5002	CIRCUNSCRIPCIÓN SENATORIAL 2 ...	6002	DISTRITO 2 ...	...	SIN PACTO ...	PARTIDO DEMOCRATA CRISTIANO ...
3	4	Elección de Presidente 2021 ...	3001	DE TARAPACA ...	4011	IQUIQUE ...	2501	IQUIQUE ...	7007	IQUIQUE ...	...	101	LICEO DEPORTIVO ELENA DUVAUCHELLE CABEZON ...	ANIBAL PINTO 955 ...	5002	CIRCUNSCRIPCIÓN SENATORIAL 2 ...	6002	DISTRITO 2 ...	...	SIN PACTO ...	EVOLUCION POLITICA ...
4	4	Elección de Presidente 2021 ...	3001	DE TARAPACA ...	4011	IQUIQUE ...	2501	IQUIQUE ...	7007	IQUIQUE ...	...	101	LICEO DEPORTIVO ELENA DUVAUCHELLE CABEZON ...	ANIBAL PINTO 955 ...	5002	CIRCUNSCRIPCIÓN SENATORIAL 2 ...	6002	DISTRITO 2 ...	...	SIN PACTO ...	UNION PATRIOTICA ...

5 rows × 34 columns

In [11]:

            
                Copied!
                
codes = pd.read_excel(
    DATA_PATH / "CUT_2018_v04.xls"
)
codes
codes = pd.read_excel(
    DATA_PATH / "CUT_2018_v04.xls"
)
codes

Out[11]:

	Código Región	Nombre Región	Abreviatura Región	Código Provincia	Nombre Provincia	Código Comuna 2018	Nombre Comuna
0	1	Tarapacá	TPCA	11	Iquique	1101	Iquique
1	1	Tarapacá	TPCA	11	Iquique	1107	Alto Hospicio
2	1	Tarapacá	TPCA	14	Tamarugal	1401	Pozo Almonte
3	1	Tarapacá	TPCA	14	Tamarugal	1402	Camiña
4	1	Tarapacá	TPCA	14	Tamarugal	1403	Colchane
...	...	...	...	...	...	...	...
341	16	Ñuble	NUBLE	163	Punilla	16301	San Carlos
342	16	Ñuble	NUBLE	163	Punilla	16302	Coihueco
343	16	Ñuble	NUBLE	163	Punilla	16303	Ñiquén
344	16	Ñuble	NUBLE	163	Punilla	16304	San Fabián
345	16	Ñuble	NUBLE	163	Punilla	16305	San Nicolás

346 rows × 7 columns

In [12]:

            
                Copied!
                
elecciones.columns
elecciones.columns

Out[12]:

Index(['eleccion_id', 'eleccion_nombre', 'region_id', 'region_nombre',
       'provincia_id', 'provincia_nombre', 'comuna_id', 'comuna_nombre',
       'circele_id', 'circele_nombre', 'local_id', 'local_nombre', 'mesa_id',
       'mesas_fusionadas', 'mesa_electores', 'mesa_numero', 'tipomesa',
       'mesa_descuadrada_preliminar', 'mesa_descuadrada_provisorio',
       'candidato', 'votos_preliminar_string', 'votos_provisorio_string',
       'nvoto', 'electo', 'codigo_colegio', 'nombre_colegio', 'sede_colegio',
       'cs_id', 'cs', 'distrito_id', 'distrito', 'lista', 'pacto',
       'partido_politico'],
      dtype='object')

In [13]:

            
                Copied!
                
elecciones["votos_preliminar_string"].astype(int)
elecciones["votos_preliminar_string"].astype(int)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_20150/3751878002.py in <module>
----> 1 elecciones["votos_preliminar_string"].astype(int)

~/miniconda3/envs/aves/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
   5813         else:
   5814             # else, only a single dtype is given
-> 5815             new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   5816             return self._constructor(new_data).__finalize__(self, method="astype")
   5817 

~/miniconda3/envs/aves/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
    416 
    417     def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 418         return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
    419 
    420     def convert(

~/miniconda3/envs/aves/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
    325                     applied = b.apply(f, **kwargs)
    326                 else:
--> 327                     applied = getattr(b, f)(**kwargs)
    328             except (TypeError, NotImplementedError):
    329                 if not ignore_failures:

~/miniconda3/envs/aves/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
    589         values = self.values
    590 
--> 591         new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
    592 
    593         new_values = maybe_coerce_values(new_values)

~/miniconda3/envs/aves/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_array_safe(values, dtype, copy, errors)
   1307 
   1308     try:
-> 1309         new_values = astype_array(values, dtype, copy=copy)
   1310     except (ValueError, TypeError):
   1311         # e.g. astype_nansafe can fail on object-dtype of strings

~/miniconda3/envs/aves/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_array(values, dtype, copy)
   1255 
   1256     else:
-> 1257         values = astype_nansafe(values, dtype, copy=copy)
   1258 
   1259     # in pandas we don't store numpy str dtypes, so convert to object

~/miniconda3/envs/aves/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
   1172         # work around NumPy brokenness, #1987
   1173         if np.issubdtype(dtype.type, np.integer):
-> 1174             return lib.astype_intsafe(arr, dtype)
   1175 
   1176         # if we have a datetime/timedelta array of objects

~/miniconda3/envs/aves/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe()

ValueError: invalid literal for int() with base 10: '                                                                                                    '

In [14]:

            
                Copied!
                
elecciones["votos_preliminar_string_strip"] = (
    elecciones["votos_preliminar_string"].str.strip().str.replace(r"^$", "0").astype(int)
)
elecciones["votos_preliminar_string_strip"] = (
    elecciones["votos_preliminar_string"].str.strip().str.replace(r"^$", "0").astype(int)
)

/tmp/ipykernel_20150/1964033267.py:2: FutureWarning: The default value of regex will change from True to False in a future version.
  elecciones["votos_preliminar_string"].str.strip().str.replace(r"^$", "0").astype(int)

In [15]:

            
                Copied!
                
elecciones["votos_preliminar_string_strip"].describe()
elecciones["votos_preliminar_string_strip"].describe()

Out[15]:

count    513029.000000
mean         41.236916
std          56.684928
min           0.000000
25%           3.000000
50%          17.000000
75%          45.000000
max         744.000000
Name: votos_preliminar_string_strip, dtype: float64

In [16]:

            
                Copied!
                
elecciones.groupby("candidato")["votos_preliminar_string_strip"].sum()
elecciones.groupby("candidato")["votos_preliminar_string_strip"].sum()

Out[16]:

candidato
EDUARDO ARTES BRICHETTI                                                                                  102493
FRANCO PARISI FERNANDEZ                                                                                  898407
GABRIEL BORIC FONT                                                                                      1796664
JOSE ANTONIO KAST RIST                                                                                  1954053
MARCO ENRIQUEZ-OMINAMI GUMUCIO                                                                           533657
SEBASTIAN SICHEL RAMIREZ                                                                                 894800
Total Sufragios Emitidos                                                                                7036860
Total Suma Calculada                                                                                    7036860
Votos Blancos                                                                                             31256
Votos Nulos                                                                                               57266
YASNA PROVOSTE CAMPILLAY                                                                                 813418
Name: votos_preliminar_string_strip, dtype: int64

In [17]:

            
                Copied!
                
elecciones["comuna_nombre"]
elecciones["comuna_nombre"]

Out[17]:

0         IQUIQUE                                       ...
1         IQUIQUE                                       ...
2         IQUIQUE                                       ...
3         IQUIQUE                                       ...
4         IQUIQUE                                       ...
                                ...                        
513024    SAN NICOLAS                                   ...
513025    SAN NICOLAS                                   ...
513026    SAN NICOLAS                                   ...
513027    SAN NICOLAS                                   ...
513028    SAN NICOLAS                                   ...
Name: comuna_nombre, Length: 513029, dtype: object

In [18]:

            
                Copied!
                
elecciones["comuna_nombre"] = elecciones["comuna_nombre"].str.strip()
elecciones["comuna_nombre"]
elecciones["comuna_nombre"] = elecciones["comuna_nombre"].str.strip()
elecciones["comuna_nombre"]

Out[18]:

0             IQUIQUE
1             IQUIQUE
2             IQUIQUE
3             IQUIQUE
4             IQUIQUE
             ...     
513024    SAN NICOLAS
513025    SAN NICOLAS
513026    SAN NICOLAS
513027    SAN NICOLAS
513028    SAN NICOLAS
Name: comuna_nombre, Length: 513029, dtype: object

In [19]:

            
                Copied!
                
                    
                    
                
                

        
turnout = (
    elecciones.groupby(["comuna_nombre", "candidato"])["votos_preliminar_string_strip"]
    .sum()
    .unstack()
)
turnout.head()
turnout = (
    elecciones.groupby(["comuna_nombre", "candidato"])["votos_preliminar_string_strip"]
    .sum()
    .unstack()
)
turnout.head()

Out[19]:

candidato	EDUARDO ARTES BRICHETTI	FRANCO PARISI FERNANDEZ	GABRIEL BORIC FONT	JOSE ANTONIO KAST RIST	MARCO ENRIQUEZ-OMINAMI GUMUCIO	SEBASTIAN SICHEL RAMIREZ	Total Sufragios Emitidos	Total Suma Calculada	Votos Blancos	Votos Nulos	YASNA PROVOSTE CAMPILLAY
comuna_nombre
ALGARROBO	90	568	1978	2877	472	1352	8217	8217	20	43	817
ALHUE	31	357	636	467	151	320	2517	2517	16	19	520
ALTO BIOBIO	40	255	377	502	119	155	1997	1997	80	81	399
ALTO DEL CARMEN	11	365	260	265	56	87	1966	1966	14	15	891
ALTO HOSPICIO	263	9884	3183	5860	1712	1212	24579	24579	70	173	2212

In [20]:

            
                Copied!
                
turnout.columns
turnout.columns

Out[20]:

Index(['EDUARDO ARTES BRICHETTI                                                                             ',
       'FRANCO PARISI FERNANDEZ                                                                             ',
       'GABRIEL BORIC FONT                                                                                  ',
       'JOSE ANTONIO KAST RIST                                                                              ',
       'MARCO ENRIQUEZ-OMINAMI GUMUCIO                                                                      ',
       'SEBASTIAN SICHEL RAMIREZ                                                                            ',
       'Total Sufragios Emitidos                                                                            ',
       'Total Suma Calculada                                                                                ',
       'Votos Blancos                                                                                       ',
       'Votos Nulos                                                                                         ',
       'YASNA PROVOSTE CAMPILLAY                                                                            '],
      dtype='object', name='candidato')

In [21]:

            
                Copied!
                
turnout.columns = list(map(lambda x: x.strip(), turnout.columns))
turnout.columns
turnout.columns = list(map(lambda x: x.strip(), turnout.columns))
turnout.columns

Out[21]:

Index(['EDUARDO ARTES BRICHETTI', 'FRANCO PARISI FERNANDEZ',
       'GABRIEL BORIC FONT', 'JOSE ANTONIO KAST RIST',
       'MARCO ENRIQUEZ-OMINAMI GUMUCIO', 'SEBASTIAN SICHEL RAMIREZ',
       'Total Sufragios Emitidos', 'Total Suma Calculada', 'Votos Blancos',
       'Votos Nulos', 'YASNA PROVOSTE CAMPILLAY'],
      dtype='object')

In [22]:

            
                Copied!
                
turnout.sort_values("YASNA PROVOSTE CAMPILLAY", ascending=False)
turnout.sort_values("YASNA PROVOSTE CAMPILLAY", ascending=False)

Out[22]:

	EDUARDO ARTES BRICHETTI	FRANCO PARISI FERNANDEZ	GABRIEL BORIC FONT	JOSE ANTONIO KAST RIST	MARCO ENRIQUEZ-OMINAMI GUMUCIO	SEBASTIAN SICHEL RAMIREZ	Total Sufragios Emitidos	Total Suma Calculada	Votos Blancos	Votos Nulos	YASNA PROVOSTE CAMPILLAY
comuna_nombre
MAIPU	4173	17651	75646	42681	20921	25947	206897	206897	600	1631	19369
PUENTE ALTO	4072	19335	68855	33878	22293	23319	190892	190892	546	1745	18237
LA FLORIDA	3003	10735	58319	33562	13959	23602	159536	159536	431	1081	16861
VIÑA DEL MAR	2319	13865	41768	47235	9966	20935	150401	150401	398	967	13568
VALPARAISO	3297	13490	45641	30453	11383	12314	129972	129972	438	1126	12912
...	...	...	...	...	...	...	...	...	...	...	...
COLCHANE	4	433	17	419	5	10	915	915	1	8	18
RIO VERDE	0	31	52	101	29	26	258	258	0	2	17
TIMAUKEL	1	22	24	71	23	19	171	171	1	2	8
SAN GREGORIO	0	33	52	95	27	32	249	249	3	2	5
ANTARTICA	0	4	4	34	2	2	47	47	0	0	1

346 rows × 11 columns

In [23]:

            
                Copied!
                
codes["comuna_nombre"] = codes["Nombre Comuna"].str.upper().map(deaccent)
codes["comuna_nombre"]
codes["comuna_nombre"] = codes["Nombre Comuna"].str.upper().map(deaccent)
codes["comuna_nombre"]

Out[23]:

0            IQUIQUE
1      ALTO HOSPICIO
2       POZO ALMONTE
3             CAMINA
4           COLCHANE
           ...      
341       SAN CARLOS
342         COIHUECO
343           NIQUEN
344       SAN FABIAN
345      SAN NICOLAS
Name: comuna_nombre, Length: 346, dtype: object

In [24]:

            
                Copied!
                
casen_comunas = casen.join(
    codes.set_index("Código Comuna 2018")["comuna_nombre"], on="comuna"
)
casen_comunas
casen_comunas = casen.join(
    codes.set_index("Código Comuna 2018")["comuna_nombre"], on="comuna"
)
casen_comunas

Out[24]:

	folio	o	id_vivienda	hogar	region	provincia	comuna	zona	expr	expc	...	hh_d_servbas	hh_d_entorno	hh_d_accesi	hh_d_medio	hh_d_appart	hh_d_tsocial	hh_d_seg	pobreza_multi_4d	pobreza_multi_5d	comuna_nombre
0	1.101100e+11	1.0	1.101100e+09	1.0	1.0	11.0	1101.0	1.0	39.0	33.0	...	1.0	1.0	0.0	1.0	0.0	0.0	1.0	NaN	NaN	IQUIQUE
1	1.101100e+11	1.0	1.101100e+09	1.0	1.0	11.0	1101.0	1.0	39.0	33.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	IQUIQUE
2	1.101100e+11	1.0	1.101100e+09	1.0	1.0	11.0	1101.0	1.0	39.0	33.0	...	0.0	1.0	0.0	1.0	0.0	1.0	1.0	NaN	NaN	IQUIQUE
3	1.101100e+11	2.0	1.101100e+09	1.0	1.0	11.0	1101.0	1.0	39.0	33.0	...	0.0	1.0	0.0	1.0	0.0	1.0	1.0	NaN	NaN	IQUIQUE
4	1.101100e+11	1.0	1.101100e+09	1.0	1.0	11.0	1101.0	1.0	39.0	33.0	...	1.0	1.0	0.0	1.0	NaN	1.0	0.0	0.0	NaN	IQUIQUE
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
216434	1.520130e+12	1.0	1.520130e+10	1.0	15.0	152.0	15201.0	2.0	7.0	4.0	...	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	PUTRE
216435	1.520130e+12	2.0	1.520130e+10	1.0	15.0	152.0	15201.0	2.0	7.0	4.0	...	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	PUTRE
216436	1.520130e+12	1.0	1.520130e+10	1.0	15.0	152.0	15201.0	2.0	7.0	4.0	...	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	PUTRE
216437	1.520130e+12	2.0	1.520130e+10	1.0	15.0	152.0	15201.0	2.0	7.0	4.0	...	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	PUTRE
216438	1.520130e+12	1.0	1.520130e+10	1.0	15.0	152.0	15201.0	2.0	7.0	4.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	PUTRE

216439 rows × 805 columns

In [25]:

            
                Copied!
                
casen_2020["comuna_nombre"] = casen_2020["comuna"].str.upper().map(deaccent)
casen_2020["comuna_nombre"] = casen_2020["comuna"].str.upper().map(deaccent)

/tmp/ipykernel_20150/3108905105.py:1: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  casen_2020["comuna_nombre"] = casen_2020["comuna"].str.upper().map(deaccent)

In [26]:

            
                Copied!
                
                    
                    
                
                

        
candidatos = [
    "EDUARDO ARTES BRICHETTI",
    "GABRIEL BORIC FONT",
    "JOSE ANTONIO KAST RIST",
    "MARCO ENRIQUEZ-OMINAMI GUMUCIO",
    "FRANCO PARISI FERNANDEZ",
    "YASNA PROVOSTE CAMPILLAY",
    "SEBASTIAN SICHEL RAMIREZ",
]
candidatos = [
    "EDUARDO ARTES BRICHETTI",
    "GABRIEL BORIC FONT",
    "JOSE ANTONIO KAST RIST",
    "MARCO ENRIQUEZ-OMINAMI GUMUCIO",
    "FRANCO PARISI FERNANDEZ",
    "YASNA PROVOSTE CAMPILLAY",
    "SEBASTIAN SICHEL RAMIREZ",
]

In [27]:

            
                Copied!
                
turnout = (
    turnout.reset_index()
    .assign(comuna_nombre=lambda x: x["comuna_nombre"].map(deaccent))
    .set_index("comuna_nombre")[candidatos]
)
turnout = (
    turnout.reset_index()
    .assign(comuna_nombre=lambda x: x["comuna_nombre"].map(deaccent))
    .set_index("comuna_nombre")[candidatos]
)

In [28]:

            
                Copied!
                
                    
                    
                
                

        
comunas_x_edad = (
    casen_2020[casen_2020["edad"] >= 17]
    .assign(
        grupo_etareo=lambda x: pd.cut(
            x["edad"], bins=[17, 25, 35, 45, 55, 65, 75, 85, 95, 120]
        )
    )
    .groupby(["comuna_nombre", "grupo_etareo"])["expr"]
    .sum()
    .unstack(fill_value=0)
)
comunas_x_edad
comunas_x_edad = (
    casen_2020[casen_2020["edad"] >= 17]
    .assign(
        grupo_etareo=lambda x: pd.cut(
            x["edad"], bins=[17, 25, 35, 45, 55, 65, 75, 85, 95, 120]
        )
    )
    .groupby(["comuna_nombre", "grupo_etareo"])["expr"]
    .sum()
    .unstack(fill_value=0)
)
comunas_x_edad

/home/egraells/miniconda3/envs/aves/lib/python3.8/site-packages/pandas/core/frame.py:4486: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data[k] = com.apply_if_callable(v, data)

Out[28]:

grupo_etareo	(17, 25]	(25, 35]	(35, 45]	(45, 55]	(55, 65]	(65, 75]	(75, 85]	(85, 95]	(95, 120]
comuna_nombre
ALGARROBO	1662.0	2288.0	1701.0	2067.0	1725.0	1805.0	687.0	278.0	12.0
ALHUE	812.0	1131.0	668.0	976.0	988.0	361.0	225.0	124.0	0.0
ALTO BIOBIO	751.0	679.0	691.0	587.0	839.0	110.0	112.0	8.0	0.0
ALTO DEL CARMEN	636.0	820.0	1056.0	806.0	1212.0	414.0	52.0	86.0	0.0
ALTO HOSPICIO	20079.0	22230.0	18273.0	14513.0	11515.0	5085.0	1348.0	594.0	0.0
...	...	...	...	...	...	...	...	...	...
VITACURA	9359.0	6887.0	9910.0	9325.0	7185.0	5407.0	3907.0	979.0	210.0
YERBAS BUENAS	1910.0	1818.0	2231.0	1887.0	2256.0	1770.0	449.0	142.0	0.0
YUMBEL	2192.0	1701.0	2075.0	3104.0	3043.0	1610.0	1523.0	122.0	0.0
YUNGAY	3170.0	2805.0	2757.0	2288.0	2293.0	1982.0	1708.0	62.0	0.0
ZAPALLAR	1160.0	1268.0	721.0	1097.0	1444.0	502.0	309.0	150.0	0.0

324 rows × 9 columns

In [30]:

            
                Copied!
                
                    
                    
                
                

        
from aves.features.utils import normalize_rows
import seaborn as sns

normalized_turnout = turnout.pipe(normalize_rows)
normalized_edad = comunas_x_edad.pipe(normalize_rows)


sns.clustermap(
    normalized_turnout.join(normalized_edad)
    .corr()
    .loc[normalized_turnout.columns, normalized_edad.columns],
    center=0,
    annot=True,
    col_cluster=False,
)
from aves.features.utils import normalize_rows
import seaborn as sns

normalized_turnout = turnout.pipe(normalize_rows)
normalized_edad = comunas_x_edad.pipe(normalize_rows)


sns.clustermap(
    normalized_turnout.join(normalized_edad)
    .corr()
    .loc[normalized_turnout.columns, normalized_edad.columns],
    center=0,
    annot=True,
    col_cluster=False,
)

Out[30]:

<seaborn.matrix.ClusterGrid at 0x7fc89cac7730>

Propuesto: hacer algo similar con la CASEN 2017 (tabla casen_comunas).

In [ ]: