annuaire/scripts/finess-clean.py

# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:hydrogen
#     text_representation:
#       extension: .py
#       format_name: hydrogen
#       format_version: '1.3'
#       jupytext_version: 1.14.1
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Production d'un csv utilisable de la base FINESS
#
# En l'état, l'export CSV de la [base FINESS][finess] n'est ni satisfaisant ni directement utilisable.
#
# - Le fichier n'est pas réellement un CSV.
#     - Il est bizarrement découpé en deux sections qui correspondent au XML.
#     - Les colonnes n'ont pas de nom.
# - Le fichier est encodé au format Windows.
#
# [finess]: https://www.data.gouv.fr/en/datasets/finess-extraction-du-fichier-des-etablissements/

# %% gradient={"editing": false, "id": "4facc182", "kernelId": ""}
import pandas as pd
import numpy as np
import requests

# %% gradient={"editing": false, "id": "3f7b5d32", "kernelId": ""}
# data.gouv.fr API (v1) endpoint describing the FINESS dataset; it is queried
# further down to obtain the list of downloadable resources.
dataset_api = "https://www.data.gouv.fr/api/1/datasets/finess-extraction-du-fichier-des-etablissements/"

# %% gradient={"editing": false, "id": "58d641d4", "kernelId": ""}
# Fetch the dataset description and keep its list of resources.
# The timeout prevents the cell from hanging forever on a stalled connection,
# and raise_for_status() turns an HTTP error into an explicit exception instead
# of an obscure failure while decoding or indexing the response body.
response = requests.get(dataset_api, timeout=30)
response.raise_for_status()
resources = response.json()['resources']

# Keep the "main" resources whose title mentions geolocated data.
matching_resources = [
    r for r in resources
    if r['type'] == 'main' and 'géolocalisés' in r['title']
]
if not matching_resources:
    # IndexError is kept so the failure type matches what indexing an empty
    # list used to raise, but now with a message that explains the cause.
    raise IndexError(
        f"no 'main' resource with 'géolocalisés' in its title found at {dataset_api}"
    )

# The first match is the resource that gets downloaded.
resource_geoloc = matching_resources[0]

# %% gradient={"editing": false, "id": "13dd939b", "kernelId": ""}
# Column names assigned to the raw export when it is loaded (passed as `names=`
# to `pd.read_csv`), since the file itself carries no usable column names.
# The leading 'section' column is dropped right after loading.
headers = [
    'section',
    'nofinesset',
    'nofinessej',
    'rs',
    'rslongue',
    'complrs',
    'compldistrib',
    'numvoie',
    'typvoie',
    'voie',
    'compvoie',
    'lieuditbp',
    'commune',
    'departement',
    'libdepartement',
    'ligneacheminement',
    'telephone',
    'telecopie',
    'categetab',
    'libcategetab',
    'categagretab',
    'libcategagretab',
    'siret',
    'codeape',
    'codemft',
    'libmft',
    'codesph',
    'libsph',
    'dateouv',
    'dateautor',
    'maj',
    'numuai'
]

# %% gradient={"editing": false, "id": "b68dac89", "kernelId": ""}
# Column names for the geolocation rows. They are applied positionally to the
# leading columns of the loaded frame (once 'section' has been removed), so the
# order of this list matters.
geoloc_names = [
    'nofinesset',
    'coordxet',
    'coordyet',
    'sourcecoordet',
    'datemaj'
]

# %% gradient={"editing": false, "id": "4492d3dd", "kernelId": ""}
# Load the raw export. Every value is read as a string so that identifiers are
# not coerced to numbers, and the first line of the file is skipped.
raw_with_section = pd.read_csv(
    resource_geoloc['url'],
    sep=";", encoding="utf-8", header=None, skiprows=1,
    dtype='str',
    names=headers,
)

# Split the rows on their section label rather than at the midpoint of the file:
# cutting at `size / 2` puts rows in the wrong table as soon as the two sections
# do not hold exactly the same number of rows.
# NOTE(review): assumes the first row belongs to the structures section and that
# every structure row carries that same label in the first column — confirm
# against the export.
section_labels = raw_with_section['section']
if section_labels.empty:
    # Nothing to split; an empty mask keeps both tables empty instead of failing.
    is_structure = pd.Series(False, index=section_labels.index, dtype=bool)
else:
    is_structure = section_labels == section_labels.iloc[0]

# Same frame as before: all columns except the section label.
raw_df = raw_with_section.drop(columns=['section'])

raw_df

# %% gradient={"editing": false, "id": "2efc14bc", "kernelId": ""}
# Rows describing the structures themselves.
structures = raw_df.loc[is_structure]

structures

# %% gradient={"editing": false, "id": "283be3bb", "kernelId": ""}
# Geolocation rows only fill the leading columns: keep as many columns as there
# are geolocation names and rename them positionally.
geolocalisations = (raw_df
    .loc[~is_structure]
    .iloc[:, :len(geoloc_names)]
    .rename(columns=dict(zip(raw_df.columns, geoloc_names)))
)

geolocalisations

# %% gradient={"editing": false, "id": "b54e527e", "kernelId": ""}
# Attach the geolocation columns to every structure row. The left join keeps
# structures that have no geolocation entry.
clean_df = structures.merge(
    right=geolocalisations,
    how="left",
    on="nofinesset",
)

clean_df

# %%
# Show one randomly drawn row, transposed so that each column is on its own line.
clean_df.sample(n=1).transpose()

# %%
# Show the `siret` column on its own.
clean_df.loc[:, "siret"]

# %% [markdown] gradient={"editing": false, "id": "82306369-229c-418f-9138-d753e1b71ce4", "kernelId": ""}
# ## Vérification de la qualité des données

# %% gradient={"editing": false, "id": "64975e82-5f97-4bb4-b1d3-8aed85fa37cd", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
# Identifiers found in both the structures table and the geolocations table.
structure_ids = structures.nofinesset.values
geoloc_ids = geolocalisations.nofinesset.values
intersection = pd.Series(np.intersect1d(structure_ids, geoloc_ids))

intersection.shape

# %% gradient={"editing": false, "id": "07e3c1cb-7032-4d83-833c-0979d2592f3c", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
# Structure rows whose identifier is absent from the shared identifiers.
structure_is_shared = structures["nofinesset"].isin(intersection)
only_structures = structures.loc[~structure_is_shared]

only_structures

# %% gradient={"editing": false, "id": "cfb13e95-b622-4d89-be56-61397dc4370e", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
# Geolocation rows whose identifier is absent from the shared identifiers.
geoloc_is_shared = geolocalisations["nofinesset"].isin(intersection)
only_geolocalisations = geolocalisations.loc[~geoloc_is_shared]

only_geolocalisations

# %% gradient={"editing": false, "id": "92cd9e34-74c8-454c-96d8-3c628e7b94bd", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
# Empty placeholder list; nothing in the visible part of the file fills or reads it.
# NOTE(review): looks like an unfinished check — confirm whether it can be removed.
geolocalisations_missing = []

# %% [markdown] gradient={"editing": false, "id": "ff24d2da-6b7e-49ca-8ac9-cc1e90d32235", "kernelId": ""}
# ## Export final

# %% gradient={"editing": false, "id": "8f6f3c73-4c14-4e82-ac63-cdf9ab8e4b21", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
# Write the merged table as a UTF-8 CSV file in the current working directory.
# NOTE(review): `index` is left at its default, so the row index is written as
# an extra leading column — confirm that downstream readers expect it.
clean_df.to_csv(path_or_buf='finess-clean.csv', encoding='utf-8')

# %%