Source code for standardize_fr

# -*- coding: utf-8 -*-
"""
Created on Thu 16 Dec 2021

@author: Alexandre Kenshilik Coche
@contact: alexandre.co@hotmail.fr

This module is a collection of tools for manipulating hydrological space-time
data, especially netcdf data. It has been originally developped to provide
preprocessing tools for CWatM (https://cwatm.iiasa.ac.at/) and HydroModPy
(https://gitlab.com/Alex-Gauvain/HydroModPy), but most functions have been
designed to be of general use.

"""

#%% Imports:
import xarray as xr
xr.set_options(keep_attrs = True)
import rioxarray as rio #Not necessary, the rio module from xarray is enough
import pandas as pd
import geopandas as gpd
import numpy as np
import rasterio
import rasterio.features
from affine import Affine
from shapely.geometry import Point
# from shapely.geometry import Polygon
# from shapely.geometry import mapping
import os
import re
import sys
import gc # garbage collector
from pathlib import Path
# import matplotlib.pyplot as plt
import fiona
import datetime

from geop4th import geobricks as geo

# ========== see reproject() §Rasterize ======================================
# import geocube
# from geocube.api.core import make_geocube
# from geocube.rasterize import rasterize_points_griddata, rasterize_points_radial
# import functools
# =============================================================================

# import whitebox
# wbt = whitebox.WhiteboxTools()
# wbt.verbose = False

#%% LEGENDE: 
# ---- ° = à garder mais mettre à jour
# ---- * = à inclure dans une autre fonction ou supprimer

#%% LIST
def list():
    excluded_funcs = {
        'georef',
        'convert_to_cwatm',
        'shortname',
        'remove_double',
        'name_xml_attributes',
        'standardize_data',
        'by_name',
        'list',
        '',
        }
    
    temp_unfinished_funcs = {
        'era5', 
        'drias', 
        'corine', 
        'obb', 
        'cesoso', 
        'ucsbretagne', 
        'cropcoeff', 
        'process_rht', 
        'convert_from_h5_newsurfex', 
        'convert_from_h5_oldsurfex', 
        'correct_bias', 
        'secondary_era5_climvar',
        }
# =============================================================================
#     func_list = set(dir(sys.modules[__name__])).difference(excluded_funcs)
# =============================================================================
    
# ======= 2nd try =============================================================
#     func_list = [
#         "era5",
#         "bdalti",
#         "sim2",
#         "explore2climat",
#         "explore2eau",
#         "c3s",
#         "bnpe",
#         "hydrometry",
#         "corine",
#         ]
# =============================================================================

    # Inspired from Eric Matthes' idea: https://www.mostlypython.com/building-a-checklist-of-all-functions-in-a-file/
    import importlib
    name = __name__.split(".")
    folder = ".".join(name[0:-1])
    module = ".".join([name[-1], "py"])
    contents = importlib.resources.read_text(folder, module)
    func_pattern = r".*def ([a-z0-9_]*)\("
    func_list = set(re.findall(func_pattern, contents)).difference(excluded_funcs).difference(temp_unfinished_funcs)
    
    print(f"List of {name[-1]} supported datasets:")
    print('  - ' + '()\n  - '.join(sorted(func_list)) + '()')
  
#%% GEOREFERENCING
###############################################################################
# Georef (ex-decorate_NetCDF_for_QGIS)
def georef(data, *, data_type = 'other', include_crs = True, export_opt = False, crs = None, **time_kwargs):   
    r"""
    Description
    -----------
    Il est fréquent que les données de source externe présentent des défauts
    de formattage (SCR non inclus, coordonnées non standard, incompatibilité
    avec QGIS...).
    Cette fonction permet de générer un raster ou shapefile standardisé, 
    en particulier du point de vue de ses métadonnées, facilitant ainsi les
    opérations de géotraitement mais aussi la visualisation sous QGIS.
    
    Exemple
    -------
    import geoconvert as gc
    gc.georef(data = r"D:\CWatM\raw_results\test1\modflow_watertable_monthavg.nc", 
              data_type = 'CWatM')
    
    Parametres
    ----------
    data : str or xr.Dataset (or xr.DataArray)
        Chemin d'accès au fichier à modifier
        (le fichier original ne sera pas altéré, un nouveau fichier '(...)_QGIS.nc'
         sera créé.)
    data_type : str
        Type de données :
            'modflow' | 'DRIAS-Climat 2020' | 'DRIAS-Eau 2021' \ 'SIM 2021' |
            'DRIAS-Climat 2022' \ 'Climat 2022' | 'DRIAS-Eau 2024' \ 'SIM 2024' |
            'CWatM' | 'autre' \ 'other'
            (case insensitive)
    include_crs : bool, optional
        DESCRIPTION. The default is True.
    export_opt : bool, optional
        DESCRIPTION. The default is True.
        Le NetCDF crée est directement enregistré dans le même dossier que 
        le fichier d'origine, en rajoutant 'georef' à son nom.
    crs : int, optional
        Destination CRS, only necessary when data_type == 'other' The default is None.
    **time_kwargs : 
        Arguments for ``use_standard_time`` function:
            - var : time variable name (str), optional, default None
            - infer_from : {'dims', 'coords', 'all'}, optional, default 'dims' 

    Returns
    -------
    xarray.Dataset or geopandas.GeoDataFrame. 
    
    """
    
# =============================================================================
#     if include_crs is False:
#         if crs is not None:
#             include_crs = True
# =============================================================================
    
    # ---- NetCDF de ModFlow
    # --------------------
    if data_type.casefold() == 'modflow': 
        # Load
        data_ds = geo.load_any(data, decode_coords = 'all', decode_times = False)
        
        print("\nFormatting data...")
        # Add standard attributes for coordinates (mandatory for QGIS to correctly read data)
        data_ds.x.attrs = {'standard_name': 'projection_x_coordinate',
                            'long_name': 'x coordinate of projection',
                            'units': 'Meter'}
        data_ds.y.attrs = {'standard_name': 'projection_y_coordinate',
                            'long_name': 'y coordinate of projection',
                            'units': 'Meter'}
        print("   _ Standard attributes added for coordinates x and y")
        
        # Add CRS
        data_epsg = 2154 # Lambert 93
        crs_suffix = ''
        if include_crs:
            data_ds.rio.write_crs(data_epsg, inplace = True)
            print(f'   _ Coordinates Reference System (epsg:{data_epsg}) included.')
        else:
            print(f'   _ Coordinates Reference System not included. {data_epsg} has to be manually specified in QGIS')
            crs_suffix = 'nocrs'
        
    # ========== USELESS ==========================================================
    #     data_ds = data_ds.transpose('time', 'y', 'x')
    # =============================================================================


    # ---- NetCDF de CWatM
    # Inclusion du SCR
    # ------------------
    elif data_type.casefold() == 'cwatm'.casefold():
        # Load
        data_ds = geo.load_any(data, decode_coords = 'all', decode_times = False)
        
        print("\nFormatting data...")
        # Add CRS
        data_epsg = 2154 # Lambert 93
        crs_suffix = ''
        if include_crs:
            data_ds.rio.write_crs(data_epsg, inplace = True)
            print(f'   _ Coordinates Reference System (epsg:{data_epsg}) included.')
        else:
            print(f'   _ Coordinates Reference System not included. {data_epsg} has to be manually specified in QGIS')
            crs_suffix = 'nocrs'
    
    
    # ---- NetCDF de la DRIAS-Climat 2020 et de la DRIAS-Eau 2021
    # Données SAFRAN (tas, prtot, rayonnement, vent... ) "DRIAS-2020"
    # ainsi que : 
    # Données SURFEX (evapc, drainc, runoffc, swe, swi...) "EXPLORE2-SIM2 2021"
    # ------------------------------------------------------------
    if data_type.replace(" ", "").casefold() in ['drias2020', 'drias-climat2020',
                                'sim2021', 'drias-eau2021']:         
        # Load
        data_ds = geo.load_any(decode_coords = 'all')
        
        print("\nFormating data...")
        # Create X and Y coordinates if necessary, from i and j
        list_var_lowcase = [v.casefold() for v in list(data_ds.coords) + list(data_ds.data_vars)]
        if 'x' not in list_var_lowcase:
            data_ds = data_ds.assign_coords(
                X = ('i', 52000 + data_ds.i.values*8000))
            print("   _ X values created from i")
        if 'y' not in list_var_lowcase:
            data_ds = data_ds.assign_coords(
                Y = ('j', 1609000 + data_ds.j.values*8000))
            print("   _ Y values created from j")
        # Replace X and Y as coordinates, and rename them
        data_ds = data_ds.swap_dims(i = 'X', j = 'Y')
        print("   _ Coordinates i and j replaced with X and Y")
        data_ds = data_ds.rename(X = 'x', Y = 'y')
        print("   _ Coordinates renamed as lowcase x and y [optional]")
        # Get main variable
        var = geo.main_var(data_ds)
        print(f"   _ Main variables are: {', '.join(var)}")
        # Ensure that lat, lon, i and j will be further loaded by xarray as coords
        data_ds[var].attrs['coordinates'] = 'x y i j lat lon'
        print("   _ x, y, i, j, lat, lon ensured to be read as coordinates")

# ============== USELESS ======================================================
#         # Reorder data, to ensure QGIS Mesh detects the correct data set   
#         data_ds = data_ds[[var, 'x', 'y', 'time']]
#         print("   _ Data reordered [safety]")
# =============================================================================
# ============== USELESS ======================================================
#         # To avoid conflicts with missing values
#         data_ds[var].encoding.pop('missing_value')
#         data_ds['lat'].encoding.pop('missing_value')
#         # data_ds['lat'].encoding['_FillValue'] = np.nan
#         data_ds['lon'].encoding.pop('missing_value')
#         # data_ds['lon'].encoding['_FillValue'] = np.nan
# =============================================================================
        
        # Add standard attributes for coordinates (mandatory for QGIS to correctly read data)
        data_ds.x.attrs = {'standard_name': 'projection_x_coordinate',
                            'long_name': 'x coordinate of projection',
                            'units': 'Meter'}
        data_ds.y.attrs = {'standard_name': 'projection_y_coordinate',
                            'long_name': 'y coordinate of projection',
                            'units': 'Meter'}
        print("   _ Standard attributes added for coordinates x and y")
        
        # Add CRS
        data_epsg = 27572 # Lambert zone II
        crs_suffix = ''
        if include_crs:
            data_ds.rio.write_crs(data_epsg, inplace = True)
            print(f'   _ Coordinates Reference System (epsg:{data_epsg}) included. (NB: This might alter Panoply georeferenced vizualisation)')
        else:
            print(f'   _ Coordinates Reference System not included. {data_epsg} has to be manually specified in QGIS')
            crs_suffix = 'nocrs'
         # Incompatibilité QGIS - Panoply :
         # Pour une raison inconnue, l'inclusion du CRS 27572 ("Lambert 
         # Zone II" / "NTF (Paris)" pose problème pour le géo-référencement
         # dans Panoply (c'est comme si Panoply prenait {lat = 0 ; lon = 0} 
         # comme origine de la projection). Sans 'spatial_ref' incluse dans le
         # netCDF, Panoply géo-référence correctement les données, probablement
         # en utilisant directement les variables 'lat' et 'lon'.
      
        
    # ---- NetCDF de la DRIAS-Climat 2022 
    # Données SAFRAN (tas, prtot, rayonnement, vent... ) "EXPLORE2-Climat 2022" 
    # -------------------------------------------------------------------------
    elif data_type.replace(" ", "").casefold() in ['drias2022', 'climat2022', 'drias-climat2022']:
        # Load
        data_ds = geo.load_any(data, decode_cf = False)

        print("\nFormating data...")
        # Correcting the spatial_ref
# =============================================================================
#         data_ds['LambertParisII'] = xr.DataArray(
#             data = np.array(-2147220352.0),
#             coords = {'LambertParisII': -2147220352.0},
#             attrs = {'grid_mapping_name': 'lambert_conformal_conic_1SP',
#                      'latitude_of_origin': 52.0,
#                      'central_meridian': 0.0,
#                      'false_easting': 600000.0,
#                      'false_northing': 2200000.0,
#                      'epsg': 27572,
#                      'references': 'http://www.umr-cnrm.fr/spip.php?article125&lang=en'}) 
#         crs_suffix = 'georef'
# =============================================================================
        data_epsg = 27572 # Lambert zone II
        if include_crs:
            # data_ds = data_ds.drop('LambertParisII')
            # data_ds.rio.write_crs(f'epsg:{data_epsg}', inplace = True)
            data_ds = geo.standard_grid_mapping(data_ds, data_epsg)
            print(f'   _ Coordinates Reference System (epsg:{data_epsg}) included. (NB: This might alter Panoply georeferenced vizualisation)')
            crs_suffix = ''
        else:
            print(f'   _ Coordinates Reference System not included. {data_epsg} has to be manually specified in QGIS')
            crs_suffix = 'nocrs'
        # Get main variable
        var = geo.main_var(data_ds)
        print(f"   _ Main variable are: {', '.join(var)}")
        # Ensure that lat, and lon will be further loaded by xarray as coords
        data_ds[var].encoding['coordinates'] = 'x y lat lon'
        if 'coordinates' in data_ds[var].attrs:
            data_ds[var].attrs.pop('coordinates')
        data_ds.lat.encoding['grid_mapping'] = data_ds[var].encoding['grid_mapping']
        data_ds.lon.encoding['grid_mapping'] = data_ds[var].encoding['grid_mapping']
        print("   _ x, y, lat, lon ensured to be read as coordinates [safety]")
        # Remove grid_mapping in attributes (it is already in encoding)
        if 'grid_mapping' in data_ds[var].attrs:
            data_ds[var].attrs.pop('grid_mapping')
    
    
    # ---- NetCDF de la DRIAS-Eau 2024 
    # Données SURFEX (evapc, drainc, runoffc, swe, swi...) "EXPLORE2-SIM2 2024" 
    # -------------------------------------------------------------------------
    elif data_type.replace(" ", "").replace("-", "").casefold() in [
            'sim2024', 'driaseau2024']: 
        # Load
        data_ds = geo.load_any(data, decode_cf = False)

        print("\nFormating data...")
        # Correcting the spatial_ref
# =============================================================================
#         data_ds['LambertParisII'] = xr.DataArray(
#             data = np.array(-2147220352.0),
#             coords = {'LambertParisII': -2147220352.0},
#             attrs = {'grid_mapping_name': 'lambert_conformal_conic_1SP',
#                      'latitude_of_origin': 52.0,
#                      'central_meridian': 0.0,
#                      'false_easting': 600000.0,
#                      'false_northing': 2200000.0,
#                      'epsg': 27572,
#                      'references': 'http://www.umr-cnrm.fr/spip.php?article125&lang=en'}) 
#         crs_suffix = ''
# =============================================================================
        data_epsg = 27572 # Lambert zone II
        if include_crs:
            # if ('LambertParisII' in data_ds.coords) | ('LambertParisII' in data_ds.data_vars):
            #     data_ds = data_ds.drop('LambertParisII')
            # data_ds.rio.write_crs(data_epsg, inplace = True)
            data_ds = geo.standard_grid_mapping(data_ds, data_epsg)
            print(f'   _ Coordinates Reference System (epsg:{data_epsg}) included. (NB: This might alter Panoply georeferenced vizualisation)')
            crs_suffix = ''
        else:
            print(f'   _ Coordinates Reference System not included. {data_epsg} has to be manually specified in QGIS')
            crs_suffix = 'nocrs'
        
        # Create X and Y coordinates if necessary, from i and j
        list_var_lowcase = [v.casefold() for v in list(data_ds.coords) + list(data_ds.data_vars)]
        data_ds = data_ds.assign_coords(
            X = ('x', 52000 + data_ds.x.values*8000))
        print("   _ X values corrected from erroneous x")
        data_ds = data_ds.assign_coords(
            Y = ('y', 1609000 + data_ds.y.values*8000))
        print("   _ Y values corrected from erroneous y")
        # Replace X and Y as coordinates, and rename them
        data_ds = data_ds.swap_dims(x = 'X', y = 'Y')
        print("   _ Coordinates x and y replaced with X and Y")
        data_ds = data_ds.drop(['x', 'y'])
        print("   _ Previous coordinates x and y removed")
        data_ds = data_ds.rename(X = 'x', Y = 'y')
        print("   _ Coordinates renamed as lowcase x and y [optional]")
        # Get main variable
        var = geo.main_var(data_ds)
        print(f"   _ Main variable are: {', '.join(var)}")
        
        # Ensure that lat, and lon will be further loaded by xarray as coords
        data_ds[var].encoding['coordinates'] = 'x y lat lon'
        if 'coordinates' in data_ds[var].attrs:
            data_ds[var].attrs.pop('coordinates')
        # Reporting grid_mapping to coords/vars that should not be displayed in QGIS:
        for c in ['lat', 'lon']: 
            if (c in data_ds.coords) | (c in data_ds.data_vars):
                data_ds[c].encoding['grid_mapping'] = data_ds[var].encoding['grid_mapping']
        print("   _ x, y, lat, lon ensured to be read as coordinates [safety]")
       # ======== USELESS ============================================================
       #         for c in ['lat', 'lon', 'time_bnds']:
       #             if c in data_ds.data_vars:
       #                 data_ds = data_ds.set_coords([c])
       # =============================================================================
        
       # Remove grid_mapping in attributes (it is already in encoding)
        if 'grid_mapping' in data_ds[var].attrs:
            data_ds[var].attrs.pop('grid_mapping')
        
# ============== USELESS ======================================================
#         # Reorder data, to ensure QGIS Mesh detects the correct data set   
#         data_ds = data_ds[[var, 'x', 'y', 'time']]
#         print("   _ Data reordered [safety]")
# =============================================================================
# ============== USELESS ======================================================
#         # To avoid conflicts with missing values
#         data_ds[var].encoding.pop('missing_value')
#         data_ds['lat'].encoding.pop('missing_value')
#         # data_ds['lat'].encoding['_FillValue'] = np.nan
#         data_ds['lon'].encoding.pop('missing_value')
#         # data_ds['lon'].encoding['_FillValue'] = np.nan
# =============================================================================
        
        # Add standard attributes for coordinates (mandatory for QGIS to correctly read data)
        data_ds.x.attrs = {'standard_name': 'projection_x_coordinate',
                            'long_name': 'x coordinate of projection',
                            'units': 'metre'}
        data_ds.y.attrs = {'standard_name': 'projection_y_coordinate',
                            'long_name': 'y coordinate of projection',
                            'units': 'metre'}
        print("   _ Standard attributes added for coordintes x and y")


    # ---- Autres fichiers ou variables
    # Inclusion du SCR
    # ------------------
# =============================================================================
#     elif data_type.casefold() in ['autre', 'other']:
# =============================================================================
    else:
        # Load
        data_ds = geo.load_any(data, decode_times = True, decode_coords = 'all')
        
        x_var, y_var = geo.main_space_dims(data_ds)
# ====== old standard time handling ===========================================
#         time_coord = main_time_dims(data_ds)
# =============================================================================
        
        # Standardize spatial coords
# =============================================================================
#         if 'X' in data_ds.coords:
#             data_ds = data_ds.rename({'X': 'x'})
#         if 'Y' in data_ds.coords:
#             data_ds = data_ds.rename({'Y': 'y'})
#         if 'latitude' in data_ds.coords:
#             data_ds = data_ds.rename({'latitude': 'lat'})
#         if 'longitude' in data_ds.coords:
#             data_ds = data_ds.rename({'longitude': 'lon'})
# =============================================================================
        if x_var == 'X':
            data_ds = data_ds.rename({'X': 'x'})
        if y_var == 'Y':
            data_ds = data_ds.rename({'Y': 'y'})
        if y_var == 'latitude':
            data_ds = data_ds.rename({'latitude': 'lat'})
        if x_var == 'longitude':
            data_ds = data_ds.rename({'longitude': 'lon'})
        
        # Standardize time coord
# ====== old standard time handling ===========================================
#         if len(time_coord) == 1:
#             data_ds = data_ds.rename({time_coord: 'time'})
# =============================================================================
        data_ds = geo.use_standard_time(data_ds, **time_kwargs)
        
        if isinstance(data_ds, gpd.GeoDataFrame):
            print("\nFormatting data...")
            # Add CRS
            crs_suffix = ''
            if include_crs:
                if crs is not None:
                    data_epsg = crs
                    data_ds.set_crs(crs = crs, 
                                    inplace = True, 
                                    allow_override = True)
                    # data_ds = standard_grid_mapping(data_ds, crs)
                    print(f'   _ Coordinates Reference System (epsg:{crs.to_epsg()}) included.')
                else:
                    print("   _ Warning: No `crs` argument was passed")
            else:
                print('   _ Coordinates Reference System not included.')
                crs_suffix = 'nocrs'
        
        elif isinstance(data_ds, xr.Dataset):        
            print("\nFormatting data...")
            # Add CRS
            crs_suffix = ''
            if include_crs:
                if crs is not None:
                    data_ds.rio.write_crs(crs, inplace = True)
                    print(f'   _ Coordinates Reference System (epsg:{crs.to_epsg()}) included.')
                else:
                    print("   _ Warning: No crs argument was passed")
            else:
                print('   _ Coordinates Reference System not included.')
                crs_suffix = 'nocrs'
            
            # Add standard attributes for coordinates (mandatory for QGIS to correctly read data)
            if ('x' in list(data_ds.coords)) & ('y' in list(data_ds.coords)):
                data_ds.x.attrs = {'standard_name': 'projection_x_coordinate',
                                    'long_name': 'x coordinate of projection',
                                    'units': 'metre'}
                data_ds.y.attrs = {'standard_name': 'projection_y_coordinate',
                                    'long_name': 'y coordinate of projection',
                                    'units': 'metre'}
                print("   _ Standard attributes added for coordinates x and y")
            elif ('lon' in list(data_ds.coords)) & ('lat' in list(data_ds.coords)):
                data_ds.lon.attrs = {'standard_name': 'longitude',
                                     'long_name': 'longitude',
                                     'units': 'degrees_east'}
                data_ds.lat.attrs = {'standard_name': 'latitude',
                                     'long_name': 'latitude',
                                     'units': 'degrees_north'}
                print("   _ Standard attributes added for coordinates lat and lon")


    # ---- General changes
    var_list = geo.main_var(data_ds)
    for var in var_list:
        for optional_attrs in ['AREA_OR_POINT', 'STATISTICS_MAXIMUM',
                               'STATISTICS_MEAN', 'STATISTICS_MINIMUM',
                               'STATISTICS_STDDEV', 'STATISTICS_VALID_PERCENT']:
            if optional_attrs in data_ds[var].attrs:
                data_ds[var].attrs.pop(optional_attrs)
    
    # ---- Export
    # ---------
    if export_opt == True:
        print('\nExporting...')
        # Output filepath
        if isinstance(data, (str, Path)):
            (folder_name, _basename) = os.path.split(data)
            (file_name, file_extension) = os.path.splitext(_basename)
            output_file = os.path.join(folder_name, f"{'_'.join([file_name, 'georef', crs_suffix])}{file_extension}")
        else:
            print("   _ As data input is not a file, the result is exported to a standard directory")
            output_file = os.path.join(os.getcwd(), f"{'_'.join(['data', 'georef', crs_suffix])}.nc")
        
        # Export
        geo.export(data_ds, output_file)
        
        
    # ---- Return variable
    return data_ds


    # =========================================================================
    #%    Mémos / Corrections bugs
    # =========================================================================
    # Si jamais il y a un problème de variable qui se retrouve en 'data_var'
    # au lieu d'etre en 'coords' : 
    #     data_ds = data_ds.set_coords('i')
    
    # S'il y a un problème d'incompatibilité 'missing_value' / '_FillValue' :
    #     data_ds['lon'] = data_ds.lon.fillna(np.nan)
    #     data_ds['lat'] = data_ds.lat.fillna(np.nan)
    
    # Si jamais une variable non essentielle pose problème à l'export : 
    #     data_ds = data_ds.drop('lon')
    #     data_ds = data_ds.drop('lat')
    
    # Pour trouver les positions des valeurs nan :
    #     np.argwhere(np.isnan(data_ds.lon.values))
    
    # Pour reconvertir la date
    #     units, reference_date = ds.time.attrs['units'].split('since')
    #     ds['time'] = pd.date_range(start = reference_date, 
    #                                periods = ds.sizes['time'], freq = 'MS')
    # =========================================================================


    # Créer les coordonnées 'x' et 'y'...
# =============================================================================
#         # ... à partir des lon.lat :
#         # LAISSÉ TOMBÉ, PARCE QUE LEURS VALEURS DE LATITUDE SONT FAUSSES [!]
#         coords_xy = rasterio.warp.transform(rasterio.crs.CRS.from_epsg(4326), 
#                                             rasterio.crs.CRS.from_epsg(27572), 
#                                             np.array(data_ds.lon).reshape((data_ds.lon.size), order = 'C'),
#                                             np.array(data_ds.lat).reshape((data_ds.lat.size), order = 'C'))
#         
#         # data_ds['x'] = np.round(coords_xy[0][0:data_ds.lon.shape[1]], -1) 
#         
#         # Arrondi à la dizaine à cause de l'approx. initiale sur les lat/lon :                                               
#         x = np.round(coords_xy[0], -1).reshape(data_ds.lon.shape[0], 
#                                                data_ds.lon.shape[1], 
#                                                order = 'C')
#         # donne un motif qui se répète autant de fois qu'il y a de latitudes
#         y = np.round(coords_xy[1], -1).reshape(data_ds.lon.shape[1], 
#                                                data_ds.lon.shape[0], 
#                                                order = 'F')
#         # donne un motif qui se répète autant de fois qu'il y a de longitudes
#         
#         # data_ds['x'] = x[0,:] 
#         # data_ds['y'] = y[0,:]
#         data_ds = data_ds.assign_coords(x = ('i', x[0,:]))
#         data_ds = data_ds.assign_coords(y = ('j', y[0,:]))
# =============================================================================


#%% FILE MANAGEMENT
###############################################################################

# Shorten names
def shortname(data, data_type, ext = 'nc'):
    """
    ext : str
        
    """
    
    if ext is not None:
        ext = ext.replace('.', '')
    
    if os.path.isfile(data):
        root_folder = os.path.split(data)[0]
    else:
        root_folder = data
        
    if os.path.isdir(data):
        content_list = [os.path.join(data, f)
                     for f in os.listdir(data)]
    else:
        content_list = [data]
    
    # ---- DRIAS-Eau 2024 Indicateurs (netcdf)
    # Indicateurs SAFRAN (SWIAV, SSWI-3...) "EXPLORE2-SIM2 2024" 
    if data_type.casefold().replace(' ', '').replace('-', '') in [
            'indicateursim2024', 'indicateurssim2024','indicateurdriaseau2024',
            'indicateursdriaseau2024']:
        
        folder_pattern = re.compile('Indicateurs_(.*Annuel|.*Saisonnier)_(.*)_MF_ADAMONT_(.*)_SIM2')
        
        for elem in content_list:
            # Raccourcir le nom
            if os.path.isdir(elem):
                res = folder_pattern.findall(elem)
                if len(res) > 0:
                    outpath = '_'.join(res[0])
                    os.rename(elem, os.path.join(root_folder, outpath))
                    
    # ---- Remove custom added date-time
    elif data_type.casefold().replace(' ', '').replace('-', '') in [
            'removedate']:
        # This option enables to remove the '2024-11-06_12h53'-like suffixes
        # that I have added to file names (in order to trace the date of creation,
        # but it is unnecessary and causes file names to be too long for Windows).
        
        datetime_pattern = re.compile('(.*)_\d{4,4}-\d{2,2}-\d{2,2}_\d{2,2}h\d{2,2}.'+f'{ext}')
        
        for elem in content_list:
            if len(datetime_pattern.findall(elem)) > 0:
                os.rename(os.path.join(root_folder, elem), 
                          os.path.join(root_folder, datetime_pattern.findall(elem)[0] + f'.{ext}'))
            elif os.path.isdir(os.path.join(root_folder, elem)):
                shortname(os.path.join(root_folder, elem), data_type = data_type, ext = ext)
        

###############################################################################
def remove_double(data_folder, data_type):
    """
    The data downloaded from DRIAS website contains *some* data sets 
    available for different periods (it is the same data, but one version
    goes from 1950 to 2005, and another from 1970 to 2005). For some models,
    there are no duplicate of that sort.
    These duplicates are unnecessary. This script moves these duplicates to a
    subfolder named "doublons".
    
    Example
    -------
    folder_list = [r"Eau-SWIAV_Saisonnier_EXPLORE2-2024_historical", 
                   r"Eau-SWIAV_Saisonnier_EXPLORE2-2024_rcp45", 
                   r"Eau-SWIAV_Saisonnier_EXPLORE2-2024_rcp85"]
    root_folder = r"D:\2- Postdoc\2- Travaux\1- Veille\4- Donnees\8- Meteo\DRIAS\DRIAS-Eau\EXPLORE2-SIM2 2024\Pays Basque\Indicateurs"
    
    for folder in folder_list:
        base_folder = os.path.join(root_folder, folder)
        subfolder_list = [os.path.join(base_folder, f)
                     for f in os.listdir(base_folder) 
                     if os.path.isdir(os.path.join(base_folder, f))]
        for subfolder in subfolder_list:
            gc.remove_double(subfolder, data_type = "indicateurs drias eau 2024")
    """
    
    # ---- DRIAS-Eau 2024 Indicateurs (netcdf)
    # Indicateurs SAFRAN (SWIAV, SSWI-3...) "EXPLORE2-SIM2 2024" 
    if data_type.casefold().replace(' ', '').replace('-', '') in [
            'indicateursim2024', 'indicateurssim2024','indicateurdriaseau2024',
            'indicateursdriaseau2024']:
                
        content_list = [os.path.join(data_folder, f)
                     for f in os.listdir(data_folder) 
                     if (os.path.isfile(os.path.join(data_folder, f)) \
                         & (os.path.splitext(os.path.join(data_folder, f))[-1] == '.nc'))]
        
        model_pattern = re.compile("(.*)_(\d{4,4})-(\d{4,4})_(historical|rcp45|rcp85)_(.*)_(SIM2.*)")
        model_dict = {}
        
        # List all models
        for elem in content_list:
            filename = os.path.split(elem)[-1]
            var, date, datef, scenario, model, creation_date = model_pattern.findall(filename)[0]
            date = int(date)
            if model in model_dict:
                model_dict[model].append(date)
            else:
                model_dict[model] = [date]
        
        # Move duplicates
        if not os.path.exists(os.path.join(data_folder, "doublons")):
            os.makedirs(os.path.join(data_folder, "doublons"))
        
        for model in model_dict:
            if len(model_dict[model]) > 1:
                filename = '_'.join([var, f"{max(model_dict[model])}-{datef}", scenario, model, creation_date])
                os.rename(
                    os.path.join(data_folder, filename),
                    os.path.join(data_folder, 'doublons', filename)
                    )


###############################################################################
def name_xml_attributes(*, output_file, fields):  
    band_str_open = '\t<PAMRasterBand band="{}">\t\t<Description>{}'
    band_str_close = '</Description>\t</PAMRasterBand>\n</PAMDataset>'
    base_str = ''.join(['<PAMDataset>\n',
                       '\t<Metadata>\n',
                       '\t\t<MDI key="CRS">epsg:27572</MDI>\n',
                       '\t</Metadata>\n',
                       len(fields) * (band_str_open + band_str_close)])
    
    band_ids = [str(item) for item in list(range(1, len(fields)+1))]
    # Create a string list of indices: '1', '2', '3' ...
    addstr = [item for pair in zip(band_ids, fields) for item in pair]
    # Create a list alternating indices and names: '1', 'CLDC', '2', 'T2M' ...
    
    file_object = open(output_file + '.aux.xml', 'w')
    file_object.write(base_str.format(*addstr))
    file_object.close()


#%% PORTAL STANDARDIZATION
def by_name(data, data_name, **kwargs):
    
    # ---- Données topographiques
    #%%% IGN BD ALTI 25m (DEM)
    if data_name.casefold().replace(' ', '') in ["alti", "bdalti", "ignalti"]:
        return bdalti(data)
    
    # ---- Données climatiques
    #%%% SIM2 (csv to netcdf)
    # Données SAFRAN-ISBA (T, PRELIQ, ETP, EVAPC, SWI, RUNC... ) de réanalyse historique "SIM2" 
    elif data_name.replace(" ", "").casefold() in ['sim2']:
        return sim2(data)
    
    #%%% DRIAS-Climat 2022 (netcdf)
    # Données SAFRAN (tas, prtot, rayonnement, vent... ) "EXPLORE2-Climat 2022" 
    elif data_name.replace(" ", "").casefold() in ['drias2022', 'climat2022', 'drias-climat2022']:
        return explore2climat(data)
    
    #%%% DRIAS-Eau 2024 (netcdf)
    # Données SURFEX (evapc, drainc, runoffc, swe, swi...) "EXPLORE2-SIM2 2024" 
    elif data_name.replace(" ", "").casefold() in ['sim2024', 'drias-eau2024',
                                                   'driaseau2024']:
        return explore2eau(data)
    
    #%%% * DRIAS
    elif data_name.casefold() in ["drias"]:
        return drias(data)
    
    #%%% C3S seasonal forecast
    # https://cds.climate.copernicus.eu/datasets/seasonal-original-single-levels?tab=overview
    elif data_name.casefold() in ["c3s"]:
        return c3s(data)
    
    #%%% ERA5
    elif data_name.casefold().replace('-', '') in ["era5", "era5land", "era"]:
        return era5(data)
    
    # ---- Données d'usage de l'eau
    #%%% Water withdrawals BNPE
    elif data_name.casefold() in ['bnpe']:
        return bnpe(data, **kwargs)
    
    # ---- Données de débit
    #%%% Hydrométrie eaufrance
    elif data_name.casefold() in ['hydrometrie', 'hydrométrie', 'hydrometry',
                                  'débit', 'debit', 'discharge']:
        return hydrometry(data, **kwargs)
    
    # ---- Données d'occupation des sols
    #%%% ° Crop coefficients, version "better"
    elif data_name.casefold() in ['crop', 'crop coeff', 'crop coefficient']:
        return cropcoeff(data)
    
    #%%% ° CES OSO
    elif data_name.casefold() in ['ces oso', 'oso', 'ces']:
        return cesoso(data)
    
    #%%% ° CORINE Land Cover
    elif data_name.casefold() in ["corine", "cld", "corine land cover"]:
        return corine(data)
    
    #%%% ° Observatoire botanique de Brest (2018-2019)
    elif data_name.casefold() in ["observatoire", "obb"]:
        return obb(data)
    
    # ---- Données de sol
    #%%% ° Soil depth UCS Bretagne
    elif data_name.casefold().replace(' ', '').replace('-', '').replace('_', '') in ["ucsbretagne", "ucsbzh", "ucsppbzh"]:
        return ucsbretagne(data)

#%% STANDARDIZE FUNCTIONS
# =============================================================================
# def convert_to_cwatm(data, data_type, reso_m = None, EPSG_out=None, 
#                         EPSG_in = None, coords_extent = 'Bretagne'):
# # =============================================================================
# #     previously prepare_CWatM_input(*, data, data_type, reso_m = None, 
# #                                    EPSG_out, EPSG_in = None, 
# #                                    coords_extent = 'Bretagne'):
# # =============================================================================
# 
#     
# # ====== Ancienne version du script ===========================================
# #     (input_folder, _file_name) = os.path.split(data)
# #     (file_name, file_extension) = os.path.splitext(_file_name)
# # =============================================================================
# =============================================================================

def era5(data):

    # ---- Merge files
    # Usually ERA5-Land files are too big to be downloaded for the full period.
    # Here the first step is to merge the available ERA5.nc files.
    merged_ds = geo.merge_data(data)
    
    var_list = geo.main_var(data)
    
    # ---- Convert hourly to daily
    print("   _ Converting hourly to daily...")
    modelist = { # to differentiate extensive from intensive quantities
        'ro': 'sum',
        'sro': 'sum',
        'ssro': 'sum',
        'evavt': 'sum',
        'evatc': 'sum',
        'pev': 'sum',
        'tp': 'sum',
        'fal': 'mean',
        'rh': 'mean',
        'ssrd': 'mean',
        'strd': 'mean',
        'sp': 'mean',
        't2m': 'mean',
        'd2m': 'mean',
        'u10': 'mean',
        'v10': 'mean',
        'wind_speed': 'mean'}
    
    daily_ds = xr.Dataset()
    
    for var in var_list:
        daily_ds[var] = geo.hourly_to_daily(merged_ds, mode = modelist[var])
        
        # ---- Convert units when needed
        if var in ['ssrd', 'strd']:
            print("   _ Converting radiation units from J/m²/h to W/m²")
            daily_ds[var] = convert_downwards_radiation(daily_ds[var])
        
        elif var == 'pev':
            print("   _ Converting negative potential evapotranspiration to positive")
            # Backup of attributes and encodings
            attrs = daily_ds[var].attrs.copy()
            encod = daily_ds[var].encoding.copy()
            
            daily_ds[var] = daily_ds[var]*-1
            
            # Transferring attributes and encodings
            daily_ds[var].encoding = encod
            daily_ds[var].attrs = attrs
            
            # Case of packing
            if ('scale_factor' in daily_ds[var].encoding) | ('add_offset' in daily_ds[var].encoding):
                # Packing (lossy compression) induces a loss of precision of 
                # apprx. 1/1000 of unit, for a quantity with an interval of 150 
                # units. The packing is initially used in some original ERA5-Land data.
                print("      . Correcting packing encodings...")
                daily_ds[var].encoding['add_offset'] = daily_ds[var].encoding['add_offset']*-1

    
    return daily_ds
        

    

[docs]
def bdalti(data,
           *, to_file = False):
    """
    Standardize the DEM data from IGN's ALTI Database (https://geoservices.ign.fr/bdalti):
    
    - merge tiles
    - georeference (embed CRS = 2154, standardize `_FillValue`, `grid_mapping` and `time`)
    - replace nodata values (-99999.0) with np.nan
    

    Parameters
    ----------
    data : (list of) str or pathlib.Path, or variable (xarray.Dataset or xarray.DataArray)
        ``data`` is usually the folder containing all the raw BDALTI tiles downloaded from
        IGN website. It can also be a list of filepaths or xarray variables. 
        It can also be a single filepath or xarray variable.
    to_file : bool or path (str or pathlib.Path), default False
        If True and if ``data`` is a path (str or pathlib.Path), the resulting
        dataset will be exported to the same location as ``data``, with the name 'BDALTIV2_stz.tif'.
        If ``to_file`` is a path, the resulting dataset will be exported to this specified filepath.

    Returns
    -------
    data_ds : xarray.Dataset
        Standardized DEM.
    If ``to_file`` argument is used, the standardized DEM is exported to a file. 
    If ``to_file = True``, the standardized DEM is exported to the GeoTIFF file
    'BDALTIV2_stz.tif'.
    
    Examples
    ---------
    >>> stzfr.bdalti(
        r"E:\Inputs\DEM\IGN\BDALTIV2_2-0_25M_ASC_LAMB93-IGN69", 
        to_file = True
        )
    Exporting...
       _ Success: The data has been exported to the file 'E:\Inputs\DEM\IGN\BDALTIV2_2-0_25M_ASC_LAMB93-IGN69_PAYS_BASQUE\BDALTIV2_stz.tif'
    <xarray.Dataset> Size: 252MB
    Dimensions:      (x: 7000, y: 9000)
    Coordinates:
      * x            (x) float64 56kB 3e+05 3e+05 3e+05 ... 4.75e+05 4.75e+05
      * y            (y) float64 72kB 6.4e+06 6.4e+06 ... 6.175e+06 6.175e+06
        spatial_ref  int64 8B 0
    Data variables:
        elev         (y, x) float32 252MB nan nan nan nan nan ... nan nan nan nan
    

    """
    
    # ---- Merge tiles (if relevant)
    data_ds = geo.merge_data(data, extension = '.asc')
    
    # ---- Rename the main variable (ASCII files do not have a variable name)
    # (in case file is exported into a netCDF)
    if isinstance(data_ds, xr.Dataset):
        main_var = geo.main_vars(data_ds)[0]
        data_ds = data_ds.rename({main_var: 'elev'})
    elif isinstance(data_ds, xr.DataArray):
        data_ds = data_ds.rename('elev')
        data_ds = data_ds.to_dataset() # convert into xarray.Dataset
    else:
        print(f"Error: BD ALTI data is not supposed to be a {type(data_ds)}")
        return
    
    # ---- Georeference
    data_ds = geo.georef(data_ds, crs = 2154)
    
    # ---- Replace nodata values (-99999.0) with NaN
# =============================================================================
#     encod = data_ds['elev'].encoding # store encodings (_FillValue, compression...)
#     data_ds['elev'] = data_ds['elev'].where(data_ds['elev'] != -99999) # replace nodata
#     data_ds['elev'].encoding = encod # transfer encodings
# =============================================================================
    data_ds['elev'].encoding['_FillValue'] = np.nan
    
    # ---- Export
    if to_file == True:
        if isinstance(data, (str, Path)):
            print('\nExporting...')
            # Determine the filename
            if os.path.isdir(data): 
# =============================================================================
#                 data_folder, filelist = geo.get_filelist(data, extension = '.asc')
#                 stem = os.path.join(data_folder, filelist[0])
# =============================================================================
                stem = os.path.join(data, 'BDALTIV2.tif')
            elif os.path.isfile(data):
                stem = data
            
            # Export
            export_filepath = os.path.splitext(stem)[0] + "_stz" + ".tif"
            geo.export(data_ds, export_filepath)
            
        else:
            print("Warning; `data` should be a path (str or pathlib.Path) for using `to_file=True`.")
        
    elif isinstance(to_file, (str, Path)):
        print('\nExporting...')
        geo.export(data_ds, to_file)

    # ---- Returns results as a GEOP4TH variable (xr.Dataset)
    return data_ds




[docs]
def sim2(data,
         *, var_list = None,
         to_file = False,
         merge_csv = True):
    r"""
    Standardizes MeteoFrance "SIM2" datasets downloaded as .csv files from data.gouv website
    (https://www.data.gouv.fr/datasets/donnees-changement-climatique-sim-quotidienne)
    into standard netCDF files by:
    
    - merge all available .csv files
    - add a total_precipitation variable
    - convert .csv files into xarray.Datasets variables, which can then be 
      exported to netCDF files
    - standardize the x and y values
    - embedd the CRS (epsg:27572)
    - standardize netCDF names and attributes (coordinates, grid_mapping)
    - standardize units

    Parameters
    ----------
    data : (list of) str or pathlib.Path, or variable (pandas.Dataframe)
        ``data`` is usually the folder containing all the raw SIM2 tiles downloaded from
        the data.gouv website [https://www.data.gouv.fr/datasets/donnees-changement-climatique-sim-quotidienne]. 
        It can also be a list of filepaths or pandas variables. 
        It can also be a single filepath or pandas variable.
    var_list : iterable, optional
        List of variable names that will be standardized.
        Works with SIM2 variable names: 'DRAINC_Q' | 'RUNC_Q' | 'EVAP_Q' | 'PRETOT_Q' | 'T_Q' ...
        The '_Q' is optional ('DRAINC', 'RUNC'... also work).
        
        If None, all available variables will be standardized.
    to_file : bool or path (str or pathlib.Path), default False
        If True and if ``data`` is a path (str or pathlib.Path), the resulting
        dataset will be exported to the same location as ``data``, with the name 'SIM2_stz.tif'.
        If ``to_file`` is a path, the resulting dataset will be exported to this specified filepath.
    merge_csv : bool, default True
        Deactivate this ooption (with ``merge_csv = False``) to avoid memory issues 
        (processing will be much slower).

    Returns
    -------
    data_ds : xarray.Dataset
        Standardized meteorological data.
    If ``to_file`` argument is used, the standardized DEM is exported to a file. 
    If ``to_file = True``, the standardized DEM is exported to the GeoTIFF file
    'SIM2_stz.tif'.
    
    Examples
    --------
    >>> stzfr.sim2(
        r"E:\Inputs\Climate\SIM2\csv", 
        to_file = True
        )
    Exporting...
       _ Success: The data has been exported to the file 'E:\Inputs\Climate\SIM2\csv\SIM2_stz.nc'
    <xarray.Dataset> Size: 2GB
    Dimensions:        (time: 518, y: 134, x: 143)
    Coordinates:
      * time           (time) datetime64[ns] 4kB 1958-08-01 ... 2025-08-20
      * y              (y) int64 1kB 1617000 1625000 1633000 ... 2673000 2681000
      * x              (x) int64 1kB 60000 68000 76000 ... 1180000 1188000 1196000
        spatial_ref    int64 8B 0
    Data variables: (12/26)
        PRENEI_Q       (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
        PRELIQ_Q       (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
        T_Q            (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
        FF_Q           (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
        Q_Q            (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
        DLI_Q          (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
                ...
        ECOULEMENT_Q   (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
        WG_RACINE_Q    (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
        WGI_RACINE_Q   (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
        TINF_H_Q       (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
        TSUP_H_Q       (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
        PRETOT_Q       (time, y, x) float64 79MB nan nan nan nan ... nan nan nan nan
    """
    
# =============================================================================
#     # Needed columns
#     usecols = ['LAMBX', 'LAMBY', 'DATE']
# =============================================================================

    # Units and long names (from liste_parametres.odt https://www.data.gouv.fr/fr/datasets/r/d1ffaf5e-7d15-4fb5-a34c-f76aaf417b46)
    units_by_var = {
                 'PRENEI_Q': ['mm', 'Précipitations solides (cumul quotidien 06-06 UTC)'], 
                 'PRELIQ_Q': ['mm', 'Précipitations liquides (cumul quotidien 06-06 UTC)'], 
                 'T_Q': ['°C','Température (moyenne quotidienne)'], 
                 'FF_Q': ['m/s', 'Vitesse du vent (moyenne quotidienne)'], 
                 'Q_Q': ['g/kg','Humidité spécifique (moyenne quotidienne)'], 
                 'DLI_Q': ['J/cm2', 'Rayonnement atmosphérique (cumul quotidien)'],
                 'SSI_Q': ['J/cm2', 'Rayonnement visible (cumul quotidien)'], 
                 'HU_Q': ['%', 'Humidité relative (moyenne quotidienne)'], 
                 'EVAP_Q': ['mm', 'Evapotranspiration réelle (cumul quotidien 06-06 UTC)'], 
                 'ETP_Q': ['mm', 'Evapotranspiration potentielle (formule de Penman-Monteith)'], 
                 'PE_Q': ['mm', 'Pluies efficaces (cumul quotidien)'], 
                 'SWI_Q': ['%', "Indice d'humidité des sols (moyenne quotidienne 06-06 UTC)"],
                 'DRAINC_Q': ['mm', 'Drainage (cumul quotidien 06-06 UTC)'], 
                 'RUNC_Q': ['mm', 'Ruissellement (cumul quotidien 06-06 UTC)'], 
                 'RESR_NEIGE_Q': ['mm', 'Equivalent en eau du manteau neigeux (moyenne quotidienne 06-06 UTC)'], 
                 'RESR_NEIGE6_Q': ['mm', 'Equivalent en eau du manteau neigeux à 06 UTC'], 
                 'HTEURNEIGE_Q': ['m', 'Epaisseur du manteau neigeux (moyenne quotidienne 06-06 UTC)'], 
                 'HTEURNEIGE6_Q': ['m', 'Epaisseur du manteau à 06 UTC'], 
                 'HTEURNEIGEX_Q': ['m', 'Epaisseur du manteau neigeux maximum au cours de la journée'], 
                 'SNOW_FRAC_Q': ['%', 'Fraction de maille recouverte par la neige (moyenne quotidienne 06-06 UTC)'], 
                 'ECOULEMENT_Q': ['mm', 'Ecoulement à la base du manteau neigeux'], 
                 'WG_RACINE_Q': ['mm','Contenu en eau liquide dans la couche racinaire à 06 UTC'], 
                 'WGI_RACINE_Q': ['mm', 'Contenu en eau gelée dans la couche de racinaire à 06 UTC'], 
                 'TINF_H_Q': ['°C', 'Température minimale des 24 températures horaires'], 
                 'TSUP_H_Q': ['°C', 'Température maximale des 24 températures horaires'],
                 'PRETOT_Q': ['mm', 'Précipitations totales (cumul quotidien 06-06 UTC)'], 
                 }
    # NB: Cumulated values (day 1) are summed from 06:00 UTC (day 1) to 06:00 UTC (day 2)
    # Therefore, days correspond to Central Standard Time days.
    
    # ---- Initialization
    if isinstance(var_list, str):
        var_list = [var_list]
    else:
        var_list = list(var_list)
    for i in range(0, len(var_list)):
        if var_list[i][-2:] != '_Q':
            var_list[i] = var_list[i] + '_Q'
    
    # ---- Merge tiles (if relevant)
    if isinstance(data, (str, Path)):
        _, filelist = geo.get_filelist(data, extension = '.csv')
        _, latest = geo.get_filelist(data, extension = '.csv', tag = 'latest')
        n_files = len(filelist) - len(latest)
        print(f"Merging SIM2 files requires a significant amount of RAM: {n_files*7 + 7}Go is advised")
        print("Otherwise, pass the argument `merge_csv = False` (slower but less memory errors)")
    
    if merge_csv:
        df = geo.merge_data(data, 
                            extension = '.csv',
                            sep = ';',
                            header=0, 
                            decimal='.',
                            parse_dates=['DATE'],
                            # date_format='%Y%m%d', # Not available before pandas 2.0.0
                            )
        
        datalist = [df]
    
    else:
        print("Info: in this `merge_csv = False` mode, only the standardized data corresponding to the last input data will be returned")
        # Loading list  
        data_folder, filelist = geo.get_filelist(data, extension = '.csv')   
        datalist = [os.path.join(data_folder, f) for f in filelist]
    
    # ---- (re)Loading
    for d in datalist:
        df = geo.load_any(d, sep=';', 
                          header=0, decimal='.',
                          parse_dates=['DATE'],
                          # date_format='%Y%m%d', # Not available before pandas 2.0.0
                          )
        
        # ---- Keep only desired columns
        if var_list is not None:
            df = df[['DATE', 'LAMBX', 'LAMBY'] + var_list]
    
        # ---- Formatting    
        df.rename(columns = {'LAMBX': 'x', 'LAMBY': 'y', 'DATE': 'time'}, inplace = True)
        df[['x', 'y']] = df[['x', 'y']]*100 # convert hm to m
        df.set_index(['time', 'y', 'x'], inplace = True)
        
        # Add a column total precipitations when possible
        if ('PRENEI_Q' in df.columns) & ('PRELIQ_Q' in df.columns):
            df['PRETOT_Q'] = df['PRENEI_Q'] + df['PRELIQ_Q']
            print("   New column added: PRETOT_Q = PRENEI_Q + PRELIQ_Q")
        
        # ---- Convert to xarray.Dataset and keep formatting
        data_ds = df.to_xarray()
        # Continuous axis
        data_ds = data_ds.reindex(x = np.arange(data_ds.x.min().values, data_ds.x.max().values + 8000, 8000))
        data_ds = data_ds.reindex(y = np.arange(data_ds.y.min().values, data_ds.y.max().values + 8000, 8000))
        # Include CRS and standardize attributes
        data_ds = geo.georef(data_ds, crs = 27572)
    
        # ---- Transferring metadata and standardizing units 
        print("Treating units and metadata...")
    
        for var in list(data_ds.data_vars): # batch_var: 
            # Include metadata
            data_ds[var].attrs = {
                # 'standard_name': var, # it is not standard according to CF Convention 
                # (https://cfconventions.org/Data/cf-standard-names/current/build/cf-standard-name-table.html)
                'long_name': units_by_var[var][1],
                'units': units_by_var[var][0]}   
            
            # Standarize units
            if units_by_var[var][0] == 'mm': # mm -> m
                data_ds[var] = geo.convert_unit(data_ds[var], '/1000')
    # =============================================================================
    #         elif units_by_var[var][0] == '°C':
    #             data_ds[var] = convert_unit(data_ds[var], '+273.15')
    # =============================================================================
            elif units_by_var[var][0] == 'J/cm2': # J/cm² (daily sum) ->  W/m² 
                data_ds[var] = geo.convert_unit(data_ds[var], '/8.64')
                # *10**4 to convert cm² to m² ; and /86400 to convert J/d to W
        
            # ---- Export
            if to_file == True:
                if isinstance(data, (str, Path)):
                    print('\nExporting...')
                    # Determine the filename
                    if os.path.isdir(data): 
                        stem = os.path.join(data, f'{var}_SIM2.nc')
                    elif os.path.isfile(data):
                        stem = os.path.join(
                            os.path.split(data)[0], 
                            f'{var}_' + os.path.split(data)[-1])
                    
                    # Export
                    export_filepath = os.path.splitext(stem)[0] + "_stz" + ".nc"
                    geo.export(data_ds[[var]], export_filepath)
                    
                else:
                    print("Warning; `data` should be a path (str or pathlib.Path) for using `to_file=True`.")
                
            elif isinstance(to_file, (str, Path)):
                print('\nExporting...')
                geo.export(data_ds, 
                           os.path.join(
                               os.path.split(to_file)[0],
                               f'{var}_' + os.path.split(to_file)[-1])
                           )

    # ---- Returns results as a GEOP4TH variable (xr.Dataset)
    return data_ds

        
        

def explore2climat(data):
    # (netCDF)
    # Load
    data_ds = geo.load_any(data)
    
    # Get main variable
    var_list = geo.main_var(data_ds)
    print(f"   _ Main variable are: {', '.join(var_list)}")
    
    # Backup of attributes and encodings
    attrs_dict = {}
    encod_dict = {}
    for var in var_list:
        attrs_dict[var] = data_ds[var].attrs.copy()
        encod_dict[var] = data_ds[var].encoding.copy()
    
    # If it contains time_bnds (periods time)
    if ('time_bnds' in data_ds.coords) | ('time_bnds' in data_ds.data_vars):
        # The closing bounds is used for time coordinates
        time_attrs = data_ds['time'].attrs.copy()
        data_ds['time'] = data_ds.time_bnds.loc[{'bnds': 1}]
        data_ds['time'].attrs = time_attrs
        # time_bnds and bnds are removed
        data_ds = data_ds.drop_dims('bnds')
        print("   _ 'time_bnds' and 'bnds' removed")
    
    # Convert units
    for var in var_list:
        if var in ['evspsblpotAdjust', 'prsnAdjust', 'prtotAdjust']:
            data_ds[var] = data_ds[var]/1000*(60*60*24) # conversion from mm.s-1 to m.d-1
            data_ds[var].attrs = attrs_dict[var]
            data_ds[var].attrs['units'] = 'm.d-1'
            data_ds[var].encoding = encod_dict[var]
            print('   _ Units converted from mm/s to m/d')
    
# =============================================================================
#         # Remove compression (too long to load in QGIS otherwise)
#         data_ds[var].encoding['zlib'] = False
# =============================================================================
    
    return data_ds
        
    
def explore2eau(data):
    # Load
    xr_kwargs = {}
# =============================================================================
#         # Particular case for SSWI (grid_mapping is bugged)
#         sswi_pattern = re.compile('.*(SSWI).*')
#         if len(sswi_pattern.findall(data)) > 0:
#             xr_kwargs['drop_variables'] = 'LambertParisII'
# =============================================================================
    xr_kwargs['drop_variables'] = 'LambertParisII'
    
    data_ds = geo.load_any(data, **xr_kwargs)
    
    # Get main variable
    var_list = geo.main_var(data_ds)
    print(f"   _ Main variables are: {', '.join(var_list)}")
    
    # Backup of attributes and encodings
    attrs_dict = {}
    encod_dict = {}
    for var in var_list:
        attrs_dict[var] = data_ds[var].attrs.copy()
        encod_dict[var] = data_ds[var].encoding.copy()
    
    # If it contains time_bnds (periods time)
    if ('time_bnds' in data_ds.coords) | ('time_bnds' in data_ds.data_vars):
        # The closing bounds is used for time coordinates
        attributes = data_ds['time'].attrs.copy()
        data_ds['time'] = data_ds.time_bnds.loc[{'bnds': 1}]
        data_ds['time'].attrs = attributes
        # time_bnds and bnds are removed
        data_ds = data_ds.drop_dims('bnds')
        print("   _ 'time_bnds' and 'bnds' removed")
    
    # Convert units
    for var in var_list:
        if var in ['DRAINC', 'EVAPC', 'RUNOFFC']:
            data_ds[var] = data_ds[var]/1000 # conversion from mm.d-1 to m.d-1
            data_ds[var].attrs = attrs_dict[var]
            data_ds[var].attrs['units'] = 'm.d-1'
            data_ds[var].encoding = encod_dict[var]
            print('   _ Units converted from mm/d to m/d')
    
# =============================================================================
#         # Particular case for SSWI (_FillValue is wrong)
#         if ('missing_values' in data_ds[var].encoding) \
#             & ('_FillValue' in data_ds[var].encoding):
#             print(data_ds[var].encoding['missing_value'])
#             print(data_ds[var].encoding['_FillValue'])
# =============================================================================
# =============================================================================
#             data_ds[var].encoding['_FillValue'] = 999999
#             data_ds[var].encoding.pop('missing_value')
# =============================================================================
# ======= NOT ACCURATE ENOUGH =================================================
#         # Also, if SSWI and seasonal, dates should be shifted 45 days back
#         sswi_season_pattern = re.compile('.*(SSWI_seas).*')
#         if len(sswi_season_pattern.findall(data)) > 0:
#             encodings = data_ds['time'].encoding.copy()
#             data_ds['time'] = data_ds['time'] - pd.Timedelta(days = 45)
#             data_ds['time'].encoding = encodings
# =============================================================================
    
# =============================================================================
#         # Remove compression (too long to load in QGIS otherwise)
#         data_ds[var].encoding['zlib'] = False
# =============================================================================
    
    return data_ds
    

def c3s(data):
    data_ds = geo.load_any(data, 
                       # decode_times = True, 
                       # decode_coords = 'all'
                       )

    # Get main variable
    var = geo.main_var(data_ds)
    print(f"   _ Main variables are: {', '.join(var)}")
    # Backup of attributes and encodings
    attrs = dict()
    encod = dict()
    for v in var:
        attrs[v] = data_ds[v].attrs.copy()
        encod[v] = data_ds[v].encoding.copy()
    
    # Format time coordinate
    if ('forecast_reference_time' in data_ds.coords) | ('forecast_reference_time' in data_ds.data_vars):
        data_ds = data_ds.squeeze('forecast_reference_time').drop('forecast_reference_time')
    if ('forecast_period' in data_ds.coords) | ('forecast_period' in data_ds.data_vars):
        data_ds = data_ds.drop('forecast_period')
    
# =========== implemented in load_any =========================================
#         data_ds = use_standard_time(data_ds)
# =============================================================================
    
    return data_ds
        
    
def bnpe(data, extension = '.json'):
    data_folder, filelist = geo.get_filelist(data, extension = extension)
    folder_root = os.path.split(data_folder)[0]
    
    # Complete withdrawals timeseries by merging each file with water origin
    # =========
    # Retrieve infos ouvrages
    folder_ouvrage, filelist_ouvrage = geo.get_filelist(os.path.join(data, "ouvrages"), extension = extension)
    ouvrage_list = [geo.load_any(os.path.join(folder_ouvrage, f), sep = ';') for f in filelist_ouvrage]
    ouvrages = pd.concat(ouvrage_list, axis = 0, ignore_index = True).drop_duplicates()
    ouvrages.reset_index(drop = True, inplace = True)
    
# =============================================================================
#     if not os.path.exists(os.path.join(folder_root, "allinfo")):
#         os.makedirs(os.path.join(folder_root, "allinfo"))
# =============================================================================
    
    allinfo_gdf_list = []
    
    # Chroniques de prélèvement
    for f in filelist:
        temp_gdf = geo.load_any(os.path.join(data_folder, f), sep = ";")
        
        # Case where the files are in JSON format instead of GeoJSON:
        if isinstance(temp_gdf, dict):
            temp_gdf = pd.DataFrame.from_dict(temp_gdf['data'])
            geometry = [Point(xy) for xy in zip(temp_gdf.longitude, 
                                                temp_gdf.latitude)]
            temp_gdf = gpd.GeoDataFrame(
                temp_gdf,
                crs = 4326,
                geometry = geometry)
            
        # Combine ouvrage info and withdrawal timeseries
        temp_gdf = pd.merge(temp_gdf, ouvrages[['code_ouvrage', 
                                                'code_type_milieu', 
                                                'libelle_type_milieu']], 
                            on = 'code_ouvrage', how = 'left')
        
        # Complete allinfo_gdf_list
        allinfo_gdf_list.append(temp_gdf)
        
# =============================================================================
#         # For each year, export a json file including water origin information
#         geo.export(temp_gdf, 
#                os.path.join(folder_root, "allinfo", f),
#                encoding='utf-8')
# =============================================================================
    
    # Merge all files into a single vector one
    # =========
    merged_gdf = geo.merge_data(data = allinfo_gdf_list)
# ========= flat gdf considered useless =======================================
#     flat_merged_gdf = geo.merge_data(data = allinfo_gdf_list, flatten = True)
# =============================================================================
    
    merged_gdf['time'] = merged_gdf.annee.astype(str) + '-12-31'
    merged_gdf['time'] = pd.to_datetime(merged_gdf['time'], format = "%Y-%m-%d")
    
    # Export
    # =========
# =============================================================================
#     if not os.path.exists(os.path.join(folder_root, "concat")):
#         os.makedirs(os.path.join(folder_root, "concat"))
# =============================================================================
        
# ====== function returns a variable instead of exporting =====================
#     year_pattern = re.compile("\d{4,4}")
#     year_start = int(year_pattern.findall(os.path.splitext(filelist[0])[0])[0])
#     year_end = int(year_pattern.findall(os.path.splitext(filelist[-1])[0])[0])
#     geo.export(merged_gdf, 
#                # os.path.join(folder_root, "concat", f"{year_start}-{year_end}.shp"),
#                os.path.join(folder_root, "concat", f"{year_start}-{year_end}.json"),
#                encoding='utf-8')
# =============================================================================
    
# ========= flat gdf considered useless =======================================
#     geo.export(flat_merged_gdf, 
#                # os.path.join(folder_root, "concat", f"{year_start}-{year_end}.shp"),
#                os.path.join(folder_root, "concat", f"{year_start}-{year_end}_flat.json"),
#                encoding='utf-8')
# =============================================================================
# ========= flat gdf considered useless =======================================
#     return {f"{year_start}-{year_end}.json": merged_gdf,
#             f"{year_start}-{year_end}_flat.json": flat_merged_gdf}
# =============================================================================
    
    return merged_gdf


def hydrometry(data, extension = '.json'):
    data_folder, filelist = geo.get_filelist(data, extension = extension)
    folder_root = os.path.split(data_folder)[0]
    
    # Complete withdrawals timeseries by merging each file with water origin
    # =========
    # Retrieve infos on stations
    folder_stations, filelist_stations = geo.get_filelist(os.path.join(data, "stations"), extension = extension)
    stations_list = [gpd.read_file(os.path.join(folder_stations, f)) for f in filelist_stations]
    stations = pd.concat(stations_list, axis = 0, ignore_index = True).drop_duplicates()
    stations.reset_index(drop = True, inplace = True)
    
    allinfo_gdf_list = []
    
    # Chroniques de prélèvement
    for f in filelist:
        temp_gdf = geo.load_any(os.path.join(data_folder, f))
        
        # Case where the files are in JSON format instead of GeoJSON:
        if isinstance(temp_gdf, dict):
            temp_gdf = pd.DataFrame.from_dict(temp_gdf['data'])
            geometry = [Point(xy) for xy in zip(temp_gdf.longitude, 
                                                temp_gdf.latitude)]
            temp_gdf = gpd.GeoDataFrame(
                temp_gdf,
                crs = 4326,
                geometry = geometry)
            
        # Combine station info and discharge timeseries
        temp_gdf = pd.merge(temp_gdf, stations[['code_site',
                                                'code_station',
                                                'code_cours_eau', 
                                                'libelle_cours_eau', 
                                                'uri_cours_eau',
                                                'commentaire_influence_locale_station',
                                                'date_ouverture_station',
                                                ]], 
                            on = 'code_station', how = 'left')
        
        # Complete allinfo_gdf_list
        allinfo_gdf_list.append(temp_gdf)
    
    # Merge all files into a single vector one
    # =========
    merged_gdf = geo.merge_data(data = allinfo_gdf_list)
    
    merged_gdf['time'] = pd.to_datetime(merged_gdf['date_obs_elab'], format = "%Y-%m-%d")
    
    # Correct units
    # ========
    # convert l/s into m3/s, or mm into m
    merged_gdf = geo.convert_unit(merged_gdf, '/1000', var = 'resultat_obs_elab')
    
    return merged_gdf
        
    
    
#%%% * DRIAS
def drias(data):
    print(f'\nProcessed file = {data}')
    
    with xr.open_dataset(data, decode_coords = 'all') as data_ds:
        data_ds.load() # to unlock the resource
    
    #% Formating
    # -----------
    print('\nPre-formating...')
    # Drop useless variables
    data_ds = data_ds.drop(['i', 'j', 'lat', 'lon'])
    print('   _ Additional coords (i, j, lat, lon) dropped')
    # Add CRS
    if not EPSG_in: # EPSG = None, by default
        if 'spatial_ref' not in list(data_ds.coords) + list(data_ds.data_vars):
            EPSG_in = 27572
            data_ds.rio.write_crs(f"epsg:{EPSG_in}", inplace = True)
            print(f"   _ The Coordinates Reference System is not indicated in the original file. By default, epsg:{EPSG_in} is used")
        else:
            EPSG_in = data_ds.rio.crs.to_epsg()
    
# =============================================================================
#         data_ds = data_ds.rename(prtotAdjust = 'pr')
#         print(f"   _ Precipitation variable name modified: prtotAdjust -> pr")
# =============================================================================
    # Get main variable
    var = geo.main_var(data_ds)
    print(f"   _ Main variable is: {var}")
    
    data_ds[var] = data_ds[var]/1000*(60*60*24) # conversion from from mm.s-1 to m.d-1
    data_ds[var].attrs['units'] = 'm.d-1'
    print('   _ Units converted from mm/s to m/d')
    
    #% Reproject in defined CRS
    # -------------------------
    print('\nReprojecting...')
    print('   _ From epsg:{} to epsg:{}'.format(str(EPSG_in), str(EPSG_out)))
    
    # Resolution
    coord1 = list(data_ds.coords)[1]
    coord2 = list(data_ds.coords)[2]
    if reso_m is None:
        print('   _ Input resolution is taken as output resolution based on {}'.format(coord1))
        reso = float(data_ds[coord1][1] - data_ds[coord1][0])
        if data_ds[coord1].units[0:6].casefold() == 'degree':
            reso_m = round(reso*111200)
        elif data_ds[coord1].units[0:1].casefold() == 'm':
            reso_m = reso
    print('   _ Resolution: {} km'.format(reso_m/1000))
    
# =============================================================================
#         data_coords_conv = rasterio.warp.transform(rasterio.crs.CRS.from_epsg(EPSG_in), 
#                                                    rasterio.crs.CRS.from_epsg(EPSG_out), 
#                                                    [data_ds.x.values.min()], [data_ds.y.values.max()])
#         if reso_m == 75:
#             x_min = ac.nearest(x = data_coords_conv[0][0] - reso_input)
#             y_max = ac.nearest(x = data_coords_conv[1][0] + reso_input)
#         else:
#             x_min = (data_coords_conv[0][0] - reso_input) // reso_m * reso_m - reso_m/2
#             y_max = (data_coords_conv[1][0] + reso_input) // reso_m * reso_m + reso_m/2    
# =============================================================================
    
    
    if coords_extent == 'from_input':
        reprj_ds = data_ds.rio.reproject(
            dst_crs = 'epsg:'+str(EPSG_out), 
            resampling = rasterio.enums.Resampling(5),
            )
        print('   _ Data reprojected based on the extent and resolution of input data')
        print('     (user-defined resolution is not taken into account)')
    
    else: # In all other cases, x_min, x_max, y_max and y_min will be determined before .rio.reproject()
        if type(coords_extent) == str & coords_extent.casefold() in ["bretagne", "brittany"]:
            # Emprise sur la Bretagne
            if EPSG_out == 3035:   #LAEA
                x_min = 3164000
                x_max = 3572000
                y_max = 2980000
                y_min = 2720000
            elif EPSG_out == 2154: #Lambert-93
                x_min = 70000
                x_max = 460000
                y_max = 6909000
                y_min = 6651000
            elif EPSG_out == 4326: #WGS84
                x_min = -5.625
                x_max = -0.150
                y_max = 49.240
                y_min = 46.660
            print('   _ Extent based on Brittany')
        
        elif type(coords_extent) == list:
            [x_min, x_max, y_min, y_max] = coords_extent
            print('   _ User-defined extent')
        
        # Alignement des coordonnées selon la résolution (valeurs rondes)
        x_min = nearest(x = x_min, res = reso_m, x0 = 0)
        x_max = nearest(x = x_max, res = reso_m, x0 = 0)
        y_max = nearest(y = y_max, res = reso_m, y0 = 0)
        y_min = nearest(y = y_min, res = reso_m, y0 = 0)
        print("   _ Bounds:")
        print("               {}".format(y_max))
        print("      {}           {}".format(x_min, x_max))
        print("               {}".format(y_min))
        
        transform_ = Affine(reso_m, 0.0, x_min,  
                            0.0, -reso_m, y_max)
        
# =============================================================================
#         width_ = int(data_ds.x.shape[0] * reso_input / reso_m //1 +1)
#         height_ = int(data_ds.y.shape[0] * reso_input / reso_m //1 +1)
# =============================================================================
        
        dst_height = (y_max - y_min)//reso_m
        dst_width = (x_max - x_min)//reso_m
        
        reprj_ds = data_ds.rio.reproject(
            dst_crs = 'epsg:'+str(EPSG_out), 
            shape = (dst_height, dst_width),
            transform = transform_, 
            resampling = rasterio.enums.Resampling(5),
            # nodata = np.nan,
            )


# =============================================================================
#         # Adding attributes:
#         ds_reprj.rio.write_crs("epsg:"+str(EPSG_out), inplace = True)
#         ds_reprj.x.attrs = {'standard_name': 'projection_x_coordinate',
#                                     'long_name': 'x coordinate of projection',
#                                     'units': 'Meter'}
#         ds_reprj.y.attrs = {'standard_name': 'projection_y_coordinate',
#                                     'long_name': 'y coordinate of projection',
#                                     'units': 'Meter'}
#         ds_reprj.soildepth.attrs = {'grid_mapping': "spatial_ref"}
# =============================================================================
    
# =============================================================================
#         #% Convert values units :
#         # from [kg m-2s-1] to [mm d-1] 
#         ds_reprj[var] = ds_reprj[var] *60*60*24
#         ds_reprj.pr.attrs['units'] = 'mm.d-1'
#         # -> plutôt appliquer un facteur de conversion dans les settings de CWatM
# =============================================================================

    #% Export :
    print("\nPreparing export, formating encodings and attributes...")
    print('   _ Output file type: {}'.format(file_extension))
    
    reprj_ds[coord1].encoding = data_ds[coord1].encoding
    reprj_ds[coord2].encoding = data_ds[coord2].encoding
    print("   _ Coords attributes transferred")
    
    if 'original_shape' in reprj_ds[var].encoding.keys():
        reprj_ds[var].encoding.pop('original_shape')
        reprj_ds[coord1].encoding.pop('original_shape')
        reprj_ds[coord2].encoding.pop('original_shape')
        print("   _ 'original_shape' has been removed")
    
    
    # If it is the PrecipitationMaps, it is necessary to erase the x and y
    # '_FillValue' encoding, because precipitation inputs are taken as 
    # format model by CWatM
    if var == 'tp': 
        reprj_ds.x.encoding['_FillValue'] = None
        reprj_ds.y.encoding['_FillValue'] = None
        print("   _ Fill Value removed from x and y")
    
    # reprj_ds.x.encoding['_FillValue'] = None
    # reprj_ds.y.encoding['_FillValue'] = None
    # reprj_ds[var].encoding['_FillValue'] = np.nan
    
    # Compression (not lossy)
    reprj_ds[var].encoding['zlib'] = True
    reprj_ds[var].encoding['complevel'] = 4
    reprj_ds[var].encoding['contiguous'] = False
    reprj_ds[var].encoding['shuffle'] = True
    print('   _ Compression x4 without loss (zlib)')
    
    output_file = os.path.join(input_folder,
                               ' '.join([file_name, 
                                         'res='+str(reso_m)+'m', 
                                         'CRS='+str(EPSG_out),
                                         ]) + file_extension)
    
    reprj_ds.to_netcdf(output_file)      
    """
    La compression (2x plus petit) fait planter QGIS lorsqu'il essaye
    de lire le fichier comme un maillage.
    Il est nécessaire d'effacer la _FillValue des coordonnées x et y.
    """
    
    print('\n -> Successfully exported to:\n' + output_file)
    
    return reprj_ds


#%%% ° CORINE Land Cover
def corine(data):
    (folder_name, file_name) = os.path.split(data)
    (file_name, extension) = os.path.splitext(file_name)
    
    data_years = [1990, 2000, 2006, 2012, 2018]
    y = data_years[4]
    
    land_convert = [
        [23, 24, 25, 26, 27, 28, 29], # 1. forests
        [12, 15, 16, 17, 18, 19, 20, 21, 22], # 2. grasslands and non-irrigated crops
        [14], # 3. rice fields
        [13, ], # 4. irrigated crops
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], # 5. sealed lands
        [40, 41, 42, 43], # 6. water surfaces
        # [7, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], # x. sol nu
        ]
    
# =============================================================================
#         with xr.open_dataset(data) as data_ds:
#             data_ds.load() # to unlock the resource
# =============================================================================
    
    #%%%% Découpe du fichier (mondial) sur la Bretagne 
    # -----------------------------------------------
    with rasterio.open(data, 'r') as data:
        # data_profile = data.profile
        # data_crs = data.crs # epsg:3035
        # data_val = data.read()[0] # np.ndarray
        #     # 46000 lignes
        #     # 65000 colonnes
        # data_dtype = data.dtypes[0] # int8
        
        # Définition du périmètre de découpe
# =============================================================================
#             shapes = Polygon([
#                 (3188000, 2960000),
#                 (3188000 + 364000, 2960000),
#                 (3188000 + 364000, 2960000 - 231000),
#                 (3188000, 2960000 - 231000),
#                 (3188000, 2960000)])
# =============================================================================
# =============================================================================
#             shapes = rasterio.warp.transform_geom(
#                 {'init': 'epsg:3035'},
#                 data.crs,
#                 {'type': 'Polygon',
#                  'coordinates': [[
#                      (3188000, 2960000),
#                      (3188000 + 364000, 2960000),
#                      (3188000 + 364000, 2960000 - 231000),
#                      (3188000, 2960000 - 231000),
#                      (3188000, 2960000)]]}
#                 )
# =============================================================================
        with fiona.open(r"D:\2- Postdoc\2- Travaux\2- Suivi\2- CWatM\2022-01-12) Synthèse inputs\Mask_Bretagne_3035_1km.shp", "r") as shapefile:
            shapes = [feature["geometry"] for feature in shapefile]
        
        # Découpe
        out_raster, out_transform = rasterio.mask.mask(data, 
                                                      shapes, 
                                                      crop = True)
        
        crp_data = out_raster[0]
        
        # Mise à jour des attributs
        # data_profile = data.meta
        data_profile = data.profile
        data_profile.update({
            # "driver": "Gtiff",
        	"height": crp_data.shape[0],
        	"width": crp_data.shape[1],
        	# "nodata": -128,
        	"transform": out_transform})
        
        # Export (fichier facultatif)
        output_file = os.path.join(folder_name, file_name + '_Brittany.tif')
        with rasterio.open(output_file, 'w', **data_profile) as output_f:
            output_f.write_band(1, crp_data)
        print('\nCropped file has been successfully exported')
        
    
    #%%%% Extraction des types d'occupation du sol
    # -------------------------------------------
    data_profile['dtype'] = 'float32' # pour limiter la taille sur disque
    
    for i in range(0,6):
        # Création d'une carte de mêmes dimensions, avec la valeur 1 sur
        # les pixels correspondant au landcover voulu, et 0 sur les autres
        print('\nClass ' + str(i+1) + ":")
        count = np.zeros(crp_data.shape, dtype = data_profile['dtype'])
        count = count.astype('float32') # pour limiter la taille sur disque
        
        for code in land_convert[i]:
            # count[crp_data == code] += 1 # Normalement ça ne dépasse jamais 1
            count[crp_data == code] = 1
        
        # Export en résolution initiale (fichiers facultatifs)
        output_file = os.path.join(folder_name, file_name + '_class' + str(i+1) + '.tif')
        with rasterio.open(output_file, 'w', **data_profile) as output_f:
            output_f.write_band(1, count)
        print('   - Successfully exported into initial-resolution *.tif')

      

        # Reprojection and conversion of counts into percentages
        # dst_trsfm = Affine(1000, 0, 900000,
        #                    0, -1000, 5500000)
        # dst_trsfm = Affine(1000, 0, 3188000,
        #                    0, -1000, 2960000)
        dst_trsfm = Affine(1000, 0, 3165000,
                           0, -1000, 2954000)
        
        # data_reprj = np.zeros((4600, 6500), dtype = np.float32)
        # data_reprj = np.zeros((231, 364), dtype = np.float32)
        data_reprj = np.zeros((258, 407), dtype = np.float32)
        
        dst_data_profile = data_profile.copy()
        dst_data_profile.update({
        	"height": data_reprj.shape[0],
        	"width": data_reprj.shape[1],
        	"transform": dst_trsfm,
            "nodata": np.nan})
        
        rasterio.warp.reproject(count, 
                                destination = data_reprj, 
                                src_transform = data_profile['transform'],
                                src_crs = data_profile['crs'],
                                src_nodata = data_profile['nodata'],
                                dst_transform = dst_data_profile['transform'],
                                dst_crs = dst_data_profile['crs'],
                                dst_nodata = dst_data_profile['nodata'],
                                resampling = rasterio.enums.Resampling(5),
                                )
        
        # Export en résolution finale (fichiers intermédiaires nécessaires)
        output_file = os.path.join(folder_name, 
                                   '_'.join([file_name, '_class' + str(i+1), '1000m.tif'])
                                   )
        with rasterio.open(output_file, 'w', **dst_data_profile) as output_f:
            output_f.write_band(1, data_reprj)
        print('   - Successfully exported into coarse-resolution *.tif')
        
        
    #%%%% Conversion des derniers fichiers créés (*.tif) en *.nc
    # ---------------------------------------------------------
    var_names = [
        'fracforest',
        'fracgrassland',
        'fracirrNonPaddy',
        'fracirrPaddy',
        'fracsealed',
        'fracwater',
        ]
    
    # Initialization:
    i = 0
    with xr.open_dataset(os.path.join(folder_name, '_'.join([file_name, '_class' + str(i+1), '1000m.tif']))) as data_ds:
        data_ds.load()
    # data_ds = data_ds.squeeze('band').drop('band')
    data_ds = data_ds.rename(band = 'time')
    # data_ds = data_ds.reindex(time = [datetime.datetime(2018, 1, 31)])
    data_ds = data_ds.assign_coords({'time': ('time', [datetime.datetime(y, 1, 31)])})
    data_ds = data_ds.rename(band_data = var_names[i])
 
    for i in range(1,6):
        with xr.open_dataset(os.path.join(folder_name, '_'.join([file_name, '_class' + str(i+1), '1000m.tif']))) as data_tmp:
            data_tmp.load()
        # data_tmp = data_tmp.squeeze('band').drop('band')
        data_tmp = data_tmp.rename(band = 'time')
        # data_tmp = data_tmp.reindex(time = [datetime.datetime(2018, 1, 31)])
        data_tmp = data_tmp.assign_coords({'time': ('time', [datetime.datetime(y, 1, 31)])})
        data_tmp = data_tmp.rename(band_data = var_names[i])
        data_ds[var_names[i]] = data_tmp[var_names[i]]
    
    # Rectification de la normalisation des pourcentages
# =============================================================================
#         # data_ds.fillna(0)
#         sum_norm = data_ds[var_names[0]]+data_ds[var_names[1]]+data_ds[var_names[2]]+data_ds[var_names[3]]+data_ds[var_names[4]]+data_ds[var_names[5]]
#         for i in range(0,6):
#             data_ds[var_names[i]] = data_ds[var_names[i]]/sum_norm
# =============================================================================
    
    data_ds.rio.write_crs('epsg:3035', inplace = True)
    data_ds.x.attrs = {'standard_name': 'projection_x_coordinate',
                        'long_name': 'x coordinate of projection',
                        'units': 'Meter'}
    data_ds.y.attrs = {'standard_name': 'projection_y_coordinate',
                        'long_name': 'y coordinate of projection',
                        'units': 'Meter'}
    output_file = os.path.join(folder_name, 'fractionLandcover_Bretagne_CORINE_{}.nc'.format(str(y)))
    
    # NB : Il est nécessaire de faire disparaître les valeurs nan
# =============================================================================
#         data_ds['fracforest'].encoding['_FillValue'] = 0 # ou None ??
#         data_ds['fracgrassland'].encoding['_FillValue'] = 0 # ou None ??
#         data_ds['fracirrNonPaddy'].encoding['_FillValue'] = 0 # ou None ??
#         data_ds['fracirrPaddy'].encoding['_FillValue'] = 0 # ou None ??
#         data_ds['fracsealed'].encoding['_FillValue'] = 0 # ou None ??
#         data_ds['fracwater'].encoding['_FillValue'] = 0 # ou None ??
# =============================================================================
    
    data_ds.to_netcdf(output_file)

            
#%%% ° Observatoire botanique de Brest (2018-2019)
def obb(data):
    land_convert = [
        [0, 0, 0], # 1. forests
        [0, 0, 0], # 2. grasslands and non-irrigated crops
        [0, 0, 0], # 3. rice fields
        [0, 0, 0], # 4. irrigated crops
        [0, 0, 0], # 5. sealed lands
        [0, 0, 0], # 6. water surfaces
        # [7, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], # x. sol nu
        ]
    return 'Pas encore implémenté'


#%%% ° CES OSO
def cesoso(data):
    return "Pas encore implémenté"
    
#%%% ° Soil depth from UCS Bretagne
def ucsbretagne(data):
    data_df = gpd.read_file(data) # data_df = soil depth
    data_epsg = data_df.crs.to_epsg()
    
    #% Calcul des profondeurs locales moyennes pondérées
    print('\nComputing local weighted depth averages...')
    soil_depth_limit = 140 # valeur semi-arbitraire
    class_depth_convert = {'0':0,
                           '1':soil_depth_limit - (soil_depth_limit-100)/2,
                           '2':90,
                           '3':70,
                           '4':50,
                           '5':30,
                           '6':10,
                           }
    data_df['EPAIS_1_NUM'] = [class_depth_convert[val] for val in data_df['EPAIS_DOM'].values] 
    data_df['EPAIS_2_NUM'] = [class_depth_convert[val] for val in data_df['EPAIS_2'].values] 
    data_df['EPAIS_1_2'] = (data_df['EPAIS_1_NUM'] * data_df['P_EPAISDOM'] + data_df['EPAIS_2_NUM'] * data_df['P_EPAIS2']) / (data_df['P_EPAISDOM'] + data_df['P_EPAIS2'])
    data_df['EPAIS_1_2'] = data_df['EPAIS_1_2']/100 # Convertir cm vers m
    
    #% Reprojection dans le CRS voulu
    print('\nReprojecting...')
    print('   from epsg:{} to epsg:{}'.format(str(data_epsg), str(EPSG_out)))
    data_df.to_crs('epsg:'+str(EPSG_out), inplace = True)
    
    #% Rasterisation
    print('\nRasterizing...')
    x_min = 74000
    y_max = 6901000
    bounds_conv = rasterio.warp.transform(rasterio.crs.CRS.from_epsg(2154), 
                                          rasterio.crs.CRS.from_epsg(EPSG_out), 
                                          [74000], [6901000]) # Limites (larges) de la Bretagne en L93
    x_min = bounds_conv[0][0] // reso_m * reso_m
    y_max = bounds_conv[1][0] // reso_m * reso_m
    
    # Décalage d'un demi pixel
    if reso_m == 75:
        x_min = nearest(x = x_min) - 75/2
        y_max = nearest(x = y_max) + 75/2
    else:
        x_min = x_min - reso_m/2
        y_max = y_max + reso_m/2
    
    transform_ = Affine(reso_m, 0.0, x_min,  
                        0.0, -reso_m, y_max)
    
# =============================================================================
#         transform_ = Affine(reso_m, 0.0, x_min,  
#                             0.0, reso_m, 670500)
# =============================================================================
    
    width_ = 377000 // reso_m + 1
    height_ = 227000 // reso_m + 1
    
    raster_data = rasterio.features.rasterize(
        [(val['geometry'], val['EPAIS_1_2']) for i, val in data_df.iterrows()],
        out_shape = (height_, width_),
        transform = transform_,
        fill = np.NaN,
        all_touched = True)
    
    #% Convertir en DataSet
    print('\nConverting into xarray.Dataset...')
    # Converting to xarray dataarray:
    dataarray_ = xr.DataArray(raster_data, 
                              coords = [np.arange(y_max,
                                                  y_max - reso_m * height_,
                                                  -reso_m).astype(np.float32),
                                        np.arange(x_min, 
                                                  x_min + reso_m * width_, 
                                                  reso_m).astype(np.float32)], 
                              dims = ['y', 'x'])
    # NB : La fonction 'range()' est évitée car elle renvoie des entiers
    # QGIS n'arrive pas à détecter le CRS lorsque les axes sont des entiers
    
# =============================================================================
#         dataarray_ = xr.DataArray(raster_data.transpose(), 
#                                   coords = [range(x_min, 
#                                                   x_min + reso_m * width_, 
#                                                   reso_m),
#                                             range(y_max,
#                                                   y_max - reso_m * height_,
#                                                   -reso_m)], 
#                                   dims = ['x', 'y'])
# =============================================================================
    
    # Converting to xarray dataset:
    dataset_ = dataarray_.to_dataset(name = 'soildepth')
    
    # Adding attributes:
    dataset_.rio.write_crs("epsg:"+str(EPSG_out), inplace = True)
    dataset_.x.attrs = {'standard_name': 'projection_x_coordinate',
                                'long_name': 'x coordinate of projection',
                                'units': 'Meter'}
    dataset_.y.attrs = {'standard_name': 'projection_y_coordinate',
                                'long_name': 'y coordinate of projection',
                                'units': 'Meter'}
    dataset_.soildepth.attrs = {'grid_mapping': "spatial_ref"} # A supprimer ???
    # dataset_.assign_coords(spatial_ref = dataset_.spatial_ref)
    # dataset_.rio.write_crs("epsg:"+str(EPSG_out), inplace = True)
    

    #% Export :
    print('\nCompressing and saving...')
    
    full_file_name = ' '.join([file_name, 
                                 'res='+str(reso_m)+'m', 
                                 'CRS='+str(EPSG_out),
                                 'lim='+str(soil_depth_limit),
                                 ])
    output_file = os.path.join(input_folder, 
                               full_file_name + '.nc')
    print('   --> result exported to:\n   --> ' + output_file)
    
    var = main_var(dataset_)
    # dataset_[var].encoding['_FillValue'] = None # ou np.nan ?
    # ERREUR : dataset_.to_netcdf(output_file, encoding = {var: {'zlib': True, 'dtype': 'float32', 'scale_factor': 0.00000001, '_FillValue': np.nan},})
    dataset_.to_netcdf(output_file)
    
    #% Fichiers pour CWatM
    topsoil_ds = dataset_.where(dataset_['soildepth']<0.30, 0.30)
    topsoil_ds = topsoil_ds.where(~np.isnan(dataset_), np.nan)
    # topsoil_ds = topsoil_ds.where(~np.isnan(dataset_), 0)
    subsoil_ds = dataset_ - topsoil_ds
    output_top = os.path.join(input_folder, 
                              'soildepth1_' + full_file_name + '.nc')
    output_sub = os.path.join(input_folder, 
                              'soildepth2_' + full_file_name + '.nc')
    
    # topsoil_ds.to_netcdf(output_top, encoding = {var: {'zlib': True, 'dtype': 'float32', 'scale_factor': 0.00000001, '_FillValue': np.nan},})
    topsoil_ds.to_netcdf(output_top)
    # subsoil_ds.to_netcdf(output_sub, encoding = {var: {'zlib': True, 'dtype': 'float32', 'scale_factor': 0.00000001, '_FillValue': np.nan},})
    subsoil_ds.to_netcdf(output_sub)
    """
    La compression (27x plus petit) fait planter CWatM lorsqu'il utilise
    l'option use_soildepth_as_GWtop
    """

    
    return dataset_
    
#%%% ° Crop coefficients, version "better"
def cropcoeff(data):
    #%%%% Initialization:
    landcover_motif = re.compile('cropCoefficient[a-zA-Z]*')
    pos = landcover_motif.search(file_name).span()
    landcover = file_name[15:pos[1]]
    name = os.path.splitext(file_name)[0][0:pos[1]]
    print(f'\nProcessed landcover = {landcover}')
    print("___________________")
    
    
    with xr.open_dataset(
            data,
            decode_coords = 'all',
            ) as data:
        data.load()
    
    #%%%% Reprojection:
    print('\nReprojecting...')
    
    if not EPSG_in: # EPSG = None, by default
        if 'spatial_ref' not in list(data.coords) + list(data.data_vars):
            print("   Le SCR n'est pas renseigné dans les données initiales. Par défaut, le WGS84 est utilisé")
            EPSG_in = 4326
        else:
            EPSG_in = data.rio.crs.to_epsg()
    print("   From epsg:{} to espg:{}".format(EPSG_in, EPSG_out))
    data.rio.write_crs("epsg:{}".format(EPSG_in), inplace = True)
    
    # Emprise sur la Bretagne
    if EPSG_out == 3035:   #LAEA
        x_min = 3164000
        x_max = 3572000
        y_max = 2980000
        y_min = 2720000
    elif EPSG_out == 2154: #Lambert-93
        x_min = 70000
        x_max = 460000
        y_max = 6909000
        y_min = 6651000
# =============================================================================
#         # To properly implement management of lat/lon, reso_m should be upgraded
#         # with a system including reso_deg (future developments)
#         elif EPSG_out == 4326: #WGS84 (in this case, reso_m should be in ° instead of m)
#             x_min = -5.625
#             x_max = -0.150
#             y_max = 49.240
#             y_min = 46.660
# =============================================================================

    # Alignement des coordonnées selon la résolution (valeurs rondes)
    x_min = nearest(x = x_min, res = reso_m, x0 = 0)
    x_max = nearest(x = x_max, res = reso_m, x0 = 0)
    y_max = nearest(y = y_max, res = reso_m, y0 = 0)
    y_min = nearest(y = y_min, res = reso_m, y0 = 0)
    print("   Final bounds:")
    print("               {}".format(y_max))
    print("      {}           {}".format(x_min, x_max))
    print("               {}".format(y_min))

    # Reproject
    if reso_m is None:
        coord1 = list(data.coords)[1]
        print('NB : La resolution est déterminée à partir des {}\n'.format(coord1))
        reso = float(data[coord1][1] - data[coord1][0])
        if data[coord1].units[0:6].casefold() == 'degree':
            reso_m = round(reso*111200)
        elif data[coord1].units[0:1].casefold() == 'm':
            reso_m = reso
    print('   Resolution: {} km'.format(str(reso_m/1000)))
    
    dst_height = (y_max - y_min)//reso_m
    dst_width = (x_max - x_min)//reso_m
    
    data_reprj = data.rio.reproject(
        dst_crs = 'epsg:'+str(EPSG_out), 
        # resolution = (1000, 1000),
        shape = (dst_height, dst_width),
        transform = Affine(reso_m, 0.0, x_min,
                           0.0, -reso_m, y_max), 
        resampling = rasterio.enums.Resampling(5),
        nodata = np.nan)
    
    
    #%%%% Preparing export:
    print("\nPreparing export...")
# =============================================================================
#         # Change the order of coordinates to match QGIS standards:
#             # Normally, not necessary
#         data_reprj = data_reprj.transpose('time', 'y', 'x')
#         print("      order of coordinates = 'time', 'y', 'x'")
# =============================================================================
    # Formatting to match standards (encodings and attributes)
    print("   Formating encodings and attributes")
# =============================================================================
#         for c in ['time', 'latitude', 'longitude']:
#             data_reprj[c].attrs = data[c].attrs
#             data_reprj[c].encoding = data[c].encoding
#         for f in _fields_intersect:
#             data_reprj[f].attrs = data[f].attrs
#             data_reprj[f].encoding = data[f].encoding
# =============================================================================
    # Insert georeferencing metadata to match QGIS standards:
    # data_reprj.rio.write_crs("epsg:4326", inplace = True)
    # print("      CRS set to epsg:4326")
            
    data_reprj.x.encoding = data.lon.encoding
    data_reprj.y.encoding = data.lat.encoding
    data_reprj.x.encoding.pop('original_shape')
    data_reprj.y.encoding.pop('original_shape')
    
    var = geo.main_var(data)
    
    # If it is the PrecipitationMaps, it is necessary to erase the x and y
    # '_FillValue' encoding, because precipitation inputs are taken as 
    # format model by CWatM
    if var == 'tp': 
    # if name == 'Total precipitation daily_mean':
        data_reprj.x.encoding['_FillValue'] = None
        data_reprj.y.encoding['_FillValue'] = None
    
    # Compression (not lossy)
    data_reprj[var].encoding = data[var].encoding
    data_reprj[var].encoding.pop('chunksizes')
    data_reprj[var].encoding.pop('original_shape')
# =============================================================================
#         data_reprj[var].encoding['zlib'] = True
#         data_reprj[var].encoding['complevel'] = 4
#         data_reprj[var].encoding['contiguous'] = False
#         data_reprj[var].encoding['shuffle'] = True
# =============================================================================
    # NB: The packing induces a loss of precision of apprx. 1/1000 of unit,
    # for a quantity with an interval of 150 units. The packing is
    # initially used in the original ERA5-Land data.    
    
    print("   All attributes and encodings transfered")
    

    #%%%% Export :
    print('\nSaving...')          
    output_file = os.path.join(input_folder,
                               '_'.join([file_name, 
                                        'res='+str(reso_m)+'m', 
                                        'epsg'+str(EPSG_out),
                                         ]) + '.nc')        

    data_reprj.to_netcdf(output_file)
    print("\nLe resultat a bien été exporté dans le fichier : {}".format(output_file))
    
    return data_reprj


###############################################################################
#%%% * Formate les données RHT
def process_rht(shp_file, attrs_file, fields = 'all'):
    """
    Pour rajouter les attributs dans le shapefile, de anière à pouvoir 
    l'afficher facilement dans QGIS.

    Parameters
    ----------
    shp_file : str
        Chemin vers le fichier shapefile
    attrs_file : str
        Chemin vers la table externe d'attributs
    fields : list
        Liste des noms de colonnes que l'on veut insérer dans le shapefile

    Returns
    -------
    Create a new file.
    """
    
    # Inputs
    # ------
    gdf_in = gpd.read_file(shp_file)
    
    df_attrs = pd.read_csv(attrs_file, sep = r";", header = 0, skiprows = 0)
    
    if fields == 'all': fields = list(df_attrs.columns)
    if not isinstance(fields, list): fields = [fields]
    if 'id_drain' not in fields: fields.append('id_drain')
    print('Fields :\n{}'.format(fields))

    # Fusion
    # ------
    gdf_out = gdf_in.merge(df_attrs[fields], left_on = "ID_DRAIN", right_on = "id_drain")
    
    # Export
    # ------
    out_name = os.path.splitext(shp_file)[0] + '_merged.shp'
    gdf_out.to_file(out_name)
    
    return gdf_out    


###############################################################################
#%%% * convert_from_h5_newsurfex (with recessions)
def convert_from_h5_newsurfex(*, input_file, mesh_file, scenario = 'historic', 
                    output_format = 'NetCDF', **kwargs):
    r"""
    % DESCRIPTION:
    This function converts Ronan *.h5 files from SURFEX into NetCDF files, or
    GeoTIFF images, in order to make output files readable by QGIS.
    
    % EXAMPLE:
    >> import geoconvert as gc
    >> gc.convert_from_h5_newsurfex(input_file = r"D:\2- Postdoc\2- Travaux\1- Veille\4- Donnees\8- Meteo\Surfex\BZH\REA.h5",
                          mesh_file = r"D:\2- Postdoc\2- Travaux\1- Veille\4- Donnees\8- Meteo\Surfex\BZH\shapefile\maille_meteo_fr_pr93.shp",
                          output_format = "NetCDF",
                          fields = ["REC", "TAS"])
    
    % ARGUMENTS:
    >    
    
    % OPTIONAL ARGUMENTS:
    > output_format = 'NetCDF' (défault) | 'GeoTIFF'
    > scenario = 'historic' | 'RCP2.6' | 'RCP4.5' | 'RCP8.5'
    > kwargs:
        > fields = variable(s) to conserve (among all attributes in input file)
                 = ['ETP', 'PPT', 'REC', 'RUN', 'TAS']
        (One file per variable will be created.)
        
        > dates = for instance : ['2019-01-01', '2019-01-02', ...]
        (If no date is specified, all dates from input file are considered)
    """
    
    #% Get fields argument:
    # ---------------------
    if 'fields' in kwargs:
        fields = kwargs['fields']
        if isinstance(fields, str): fields = [fields]
        else: fields = list(fields) # in case fields are string or tuple
    else:
        print('\n/!\ Fields need to be specified\n')
        
    
    
    for _field in fields:
        print('Processing ' + _field + '...')
        print('   Loading data...')
        #% Load data sets:
        # ----------------
        # _data contains the values, loaded as a pandas dataframe
        _data = pd.read_hdf(input_file, _field + '/' + scenario)
        _data.index.freq = 'D' # Correct the frequency of timestamp, because it
                               # is altered during importation
        _data = _data.iloc[:, 0:-1] # Get rid of the last 'MEAN' column
        _data = _data.transpose()
        
        # _dataset contains the spatial metadata, and will be used as the
        # final output structure
        _dataset = gpd.read_file(mesh_file)
        _dataset.num_id = _dataset.num_id.astype(int) # Convert 'num_id' (str) 
                                                      # into integer
        
        # Only cells present in _data are considered:
        _datasubset = _dataset[_dataset.num_id.isin(_data.index)]
        _datasubset = _datasubset.merge(_data, left_on = "num_id", 
                                        right_index = True)
        _datasubset.index = _datasubset.num_id # replace indexes by id_values
        
        #% Get dates argument: (and build output_name)
        # --------------------
        _basename = os.path.splitext(input_file)[0]
        
        if 'dates' in kwargs:
            dates = kwargs['dates']
            if isinstance(dates, str): 
                output_file = '_'.join([_basename, dates, _field])
                dates = [dates]
            else: 
                dates = list(dates) # in case dates are tuple
                output_file = '_'.join([_basename, dates[0], 'to', dates[-1], _field])
        else:
            dates = _data.columns # if not input_arg, dates = all
            output_file = '_'.join([_basename, scenario, _field])
        
        #% Exports:
        # ---------       
        profile_base = {'driver': 'NetCDF',
                        'dtype': 'float32',
                        'nodata': None,
                        'width': 37,
                        'height': 23,
                        'count': 1,
                        # 'count': len(dates),
                        'crs': rio.crs.CRS.from_epsg(27572),
                        'transform': Affine(8000, 0.0, 56000, 0.0, 8000, 2261000),
                        # values deducted from Xlamb and Ylamb
                        # can also be found from QGIS: "X (ou Y) du sommet le plus proche"
                        'tiled': False,
                        'interleave': 'band'}
        
    # =============================================================================
    #     # Getting bounds from polygons is less precise:
    #     _datasubset.geometry.total_bounds[0:1]
    #     # >> [ 107438.43514434074, 6697835.528522245 ] in epsg:2154
    #     Affine(8000, 0.0, 56092.06862427108, 0.0, 8000, 2260917.5598924947)
    #     # more precise values: 57092.06862427143, 2259917.559892499
    #     #   --> does not improve much...
    # =============================================================================
    
    # =============================================================================
    #     # Using polygon limits instead of polygon centers is less precise:
    #     rasterized_data = rasterio.features.rasterize(
    #         [(_dataval.loc['geometry'], _dataval.loc[pd.to_datetime(_date)]) for i, _dataval in _datasubset.to_crs(27572).iterrows()],
    #         out_shape = (profile_base['height'], profile_base['width']),
    #         transform = profile_base['transform'],
    #         fill = 0,
    #         all_touched = True)
    #         # dtype = rasterio.uint8
    # =============================================================================
        
        #% Add coordinates of the center of each polygon, as a POINT object:
        # ------------------------------------------------------------------
        _datasubset['centers'] = [Point(x_y) 
                                  for x_y in zip(_datasubset.loc[:, 'Xlamb'], 
                                                 _datasubset.loc[:, 'Ylamb'])]
        # NB: Coordinates are here in Lambert Zone II (epsg:27572)    
        
# =============================================================================
#    #{a}     # Previous version, creating a raster per date (very slow):
#             rasterized_data = rasterio.features.rasterize(
#             [(_dataval.loc['centers'], _dataval.loc[pd.to_datetime(_date)]) 
#               for i, _dataval in _datasubset.iterrows()],
#             out_shape = (profile_base['height'], profile_base['width']),
#             transform = profile_base['transform'],
#             fill = np.NaN,
#             all_touched = True)
#             # dtype = rasterio.uint8
# =============================================================================
        
        #% Create a raster mask:
        # ---------------------
        rasterized_mask = rasterio.features.rasterize(
            [(_dataval.loc['centers'], _dataval.loc['num_id']) 
             for i, _dataval in _datasubset.iterrows()],
            out_shape = (profile_base['height'], profile_base['width']),
            transform = profile_base['transform'],
            fill = np.NaN,
            all_touched = True)
            # dtype = rasterio.uint16
        
# =============================================================================
#         #% Apercu visuel rapide :
#         plt.imshow(rasterized_mask, origin = 'lower')
# =============================================================================
        
        #% Build a xarray based on the previous mask:
        # -------------------------------------------
        print('   Building an xarray...')
        # Initialization:
        array3D = np.full((len(dates), 
                           profile_base['height'], 
                           profile_base['width']), np.nan)
        
        # Filling:
        for (_y, _x), _id_val in np.ndenumerate(rasterized_mask):
            if not np.isnan(_id_val):
                array3D[:, _y, _x] = _datasubset.loc[_id_val].iloc[10:-1]
        
        # Converting to xarray dataarray:
        dataarray = xr.DataArray(array3D, 
                                  coords = [dates, 
                                            np.arange(2265000,
                                                      2265000 + 8000*profile_base['height'],
                                                      8000).astype(np.float32),
                                            np.arange(60000, 
                                                      60000 + 8000*profile_base['width'], 
                                                      8000).astype(np.float32)],
        # NB : La fonction 'range()' est évitée car elle renvoie des entiers
        # QGIS n'arrive pas à détecter le CRS lorsque les axes sont des entiers
                                  dims = ['time', 'y', 'x'])
        
        # Converting to xarray dataset:
        dataset = dataarray.to_dataset(name = _field)
        
        # Adding attributes:
        dataset.rio.write_crs("epsg:27572", inplace = True)
        dataset.x.attrs = {'standard_name': 'projection_x_coordinate',
                                    'long_name': 'x coordinate of projection',
                                    'units': 'Meter'}
        dataset.y.attrs = {'standard_name': 'projection_y_coordinate',
                                    'long_name': 'y coordinate of projection',
                                    'units': 'Meter'}
        
        # Export:
        # -------
        if output_format.casefold() in ['nc', 'netcdf']:
            dataset.to_netcdf(output_file + '.nc')
            (folder_name, file_name) = os.path.split(output_file + '.nc')
            print("   The dataset has been successfully exported to the file '{}' in the folder '{}'\n".format(file_name, folder_name))
            
        elif output_format.casefold() in ['tif', 'tiff', 'geotiff']:
            with rasterio.open(output_file + '.tiff', 'w', **profile_base) as out_f:
                for i, d in enumerate(dates):
                    out_f.write_band(i, dataset.sel(time = d))
            (folder_name, file_name) = os.path.split(output_file + '.nc')
            print("   The dataset has been successfully exported to the file '{}' in the folder '{}'\n".format(file_name, folder_name))

        
    return fields
    
    
# =============================================================================
#     #- - - - - - SANDBOX - - - - - 
#     #% Single day:
#     # date = _data.loc[:,_data.columns == '2019-10-15'].columns.values
#     date = _data.loc[:,_data.columns == _date].columns[0]
#     _datasingleday = _datasubset.loc[:, date]
#     
#     #% Matplot:
#     _datasubset.plot(column = date, cmap = 'RdYlBu_r', vmin = 10.5, vmax = 15)
# =============================================================================


#%%% * convert_from_h5_oldsurfex (data from Quentin)
###############################################################################
def convert_from_h5_oldsurfex(*, output_format = 'csv', **kwargs):
    r"""
    % DESCRIPTION :
    Cette fonction formate les fichiers Surfex de Quentin en fichiers *.csv
    organisés par dates (lignes) et identifiants de mailles (colonnes).
    
    % EXEMPLES :
    >> import surfexconvert as sc  #(NB : il faut au préalable que le dossier soit ajouté dans le PYTHONPATH))
    >> sc.convert_from_h5_oldsurfex(output_format = "csv", 
                                 start_years = list(range(2005, 2011, 1)),
                                 variables = ['DRAIN', 'ETR'])
            
    >> sc.convert_from_oldsurfex(output_format = "nc", 
                                 mesh_file = r"D:\2- Postdoc\2- Travaux\1- Veille\4- Donnees\8- Meteo\Surfex\BZH\shapefile\maille_meteo_fr_pr93.shp")

    % ARGUMENTS (OPTIONNELS) :
    > output_format = 'csv' (défault) | 'NetCDF' | 'GeoTIFF'
    > kwargs:
        > input_folder = dossier contenant les fichiers à traiter.
            Si rien n'est spécifié, le dossier du script est pris en compte
        > variables = variable(s) à traiter (parmi DRAIN, ETR, PRCP et RUNOFF)
            Si ce n'est pas spécifié, toutes les variables sont considérées
        > start_years = années à traiter
            (par ex : 2012 correspond au fichier blabla_2012_2013)
            Si rien n'est spécifié, toutes les années sont considérées
        > mesh_file = chemin d'accès au fichier de correspondances entre les
        id des tuiles et leurs coordonnées
            (nécessaire uniquement pour NetCDF et GeoTIFF)
    """   
    
    # ---- RECUPERATION DES INPUTS
    #--------------------------
    if 'input_folder' in kwargs:
        input_folder = kwargs['input_folder']
    else:
        input_folder = os.path.dirname(os.path.realpath(__file__))
    
     # Si 'variables' est renseigné, il est converti en liste de strings.
     # Sinon, toutes les variables sont prises en compte (DRAIN, ETR, PRCP...).
    if 'variables' in kwargs:
        variables = kwargs['variables']
        if isinstance(variables, str): variables = [variables]
        else: variables = list(variables)
    else:
        variables = ['DRAIN', 'ETR', 'PRCP', 'RUNOFF']
    print('> variables = ' + ', '.join(variables))
    
     # Si 'start_years' est renseigné, il est converti en liste de strings.
     # Sinon, toutes les années sont considérées (1970-1971 -> 2011-2012).
    if 'start_years' in kwargs:
        start_years = kwargs['start_years']
        if isinstance(start_years, str): start_years = [start_years]
        else: 
            start_years = list(start_years)
            start_years = [str(y) for y in start_years]
    else:
        start_years = [str(y) for y in range(1970, 2012, 1)]
    print("> années de départ = " + ', '.join(start_years))
    
    if 'mesh_file' in kwargs:
        mesh_file = kwargs['mesh_file']
    
    
    #% Localisation des dossiers et fichiers de base : 
    #-------------------------------------------------
    idlist_file = os.path.join(input_folder, 
                               r"numero_mailles_bretagne_etendue.txt")
    # (= fichier des identifiants des mailles) 
    mesh_indexes = np.loadtxt(idlist_file, dtype = int)
    
    
    # ---- TRAITEMENT DES DONNEES
    for _variable in variables:
        print("Traitement de " + _variable + "...")
        dataset_allyears = pd.DataFrame(columns = mesh_indexes) #(initialisation)
        
        for _year in start_years:
            print(_year)
            input_file = os.path.join(input_folder, 
                                      "_".join([_variable, _year, 
                                                str(int(_year)+1)])) 
            # (= fichier de données)

            #% Importation :
            #---------------
            raw_data = pd.read_table(input_file, delimiter = "   ", 
                                     engine = 'python', header = None)
            
            # Redimensionnement :
            #--------------------
            if pd.to_datetime(str(int(_year) + 1)).is_leap_year: n_days = 366
            else: n_days = 365
            reshp_array = np.array(raw_data).reshape((n_days, 610), 
                                                     order = 'C')
            reshp_array = reshp_array[:, 0:-6] # Array de 365 lignes et 604 col
            
            # Ajout des étiquettes :
            #-----------------------
            _start_date = _year + '-07-31'
            date_indexes = pd.date_range(start = _start_date, 
                                         periods = n_days, freq = '1D')
            dataframe_1y = pd.DataFrame(data = reshp_array, 
                                        index = date_indexes, 
                                        columns = mesh_indexes, copy = True)
            
            dataset_allyears = dataset_allyears.append(dataframe_1y)
            
            # =================================================================
            #     #% Rajouter des moyennes ou sommes annuelles comme sur les 
            #        précédentes données Surfex :
            #     T['mean'] = T.mean(axis = 1)
            #     T['sum'] = T.sum(axis = 1)
            # =================================================================
          
            
        # ---- EXPORTATION
        output_file = os.path.join(input_folder, 
                                   "_".join([_variable, start_years[0],
                                            str(int(start_years[-1])+1)]
                                            )
                                   )
        
        #% Export en CSV :
        #-----------------
        if output_format in ['csv', 'CSV']:
            dataset_allyears.to_csv(output_file + ".csv", sep = '\t', 
                                    header = True) 
            (folder_name, file_name) = os.path.split(output_file + ".csv")
            print("> Le fichier \'{}\' a été créé dans le dossier \'{}\'.".format(
                file_name, folder_name))
            
        
        #% Export en NetCDF ou TIFF :
        #----------------------------
        elif output_format in ['NetCDF', 'nc', 'TIFF', 'tiff', 'tif', 'GeoTIFF']:            
            #% Géoréférencement des données / formatage
            print("Formatage...")
            # (NB : dataset_allyears sera écrasé plus tard)
            all_date_indexes = dataset_allyears.index
            n_dates = len(all_date_indexes)
            _data = dataset_allyears.transpose(copy = True)
            _geodataset = gpd.read_file(mesh_file)
            _geodataset.num_id = _geodataset.num_id.astype(int)
            
            _geodataset = _geodataset[_geodataset.num_id.isin(_data.index)]
            _geodataset = _geodataset.merge(_data, left_on = "num_id", 
                                            right_index = True)
            
            res_x = 8000 # résolution Surfex [m]
            res_y = 8000
            n_x = len(pd.unique(_geodataset.loc[:, 'Xlamb']))
            n_y = len(pd.unique(_geodataset.loc[:, 'Ylamb']))
            x_min = min(_geodataset.loc[:, 'Xlamb']) - res_x/2 #NB : not 56000
            y_min = min(_geodataset.loc[:, 'Ylamb']) - res_y/2
            profile_base = {'driver': 'NetCDF',
                            'dtype': 'float64',
                            'nodata': None,
                            # 'time': n_dates,
                            'height': n_y, #23
                            'width': n_x, #37
                            'count': n_dates,
                            'crs': rio.crs.CRS.from_epsg(27572),
                            'transform': Affine(res_x, 0.0, x_min,  
                                                0.0, res_y, y_min),
                            # values deducted from Xlamb and Ylamb and QGIS
                            'tiled': False,
                            'interleave': 'band'}
            
            _geodataset['centers'] = [Point(x_y) for x_y in 
                                        zip(_geodataset.loc[:, 'Xlamb'], 
                                            _geodataset.loc[:, 'Ylamb'])]
            # NB: Coordinates are here in Lambert Zone II (epsg:27572)    
            
            raster_data_allyears = np.empty(shape = (n_dates,
                                                     profile_base['height'], 
                                                     profile_base['width']))
            
            print("  Rasterisation... ({} éléments. Peut prendre plusieurs minutes)".format(len(all_date_indexes)))
            for d, _date in enumerate(all_date_indexes):
                _raster_data = rasterio.features.rasterize(
                    [(_dataval.loc['centers'], _dataval.loc[pd.to_datetime(_date)]) 
                      for i, _dataval in _geodataset.iterrows()],
                    out_shape = (profile_base['height'], profile_base['width']),
                    transform = profile_base['transform'],
                    fill = np.NaN,
                    all_touched = True)
                
                raster_data_allyears[d, :, :] = _raster_data
                #*** Est-ce que cette étape peut être faite telle quelle avec 
                # des données incluant tous les temps ?***
                
            print("  xarray...")
            dataarray_allyears = xr.DataArray(
                raster_data_allyears, 
                #raster_data_allyears.astype('float32'), 
                coords = [all_date_indexes, 
                          np.sort(pd.unique(_geodataset.loc[:, 'Ylamb'])), 
                          np.sort(pd.unique(_geodataset.loc[:, 'Xlamb']))],
                # <!> NB <!> Il est extrêmement important de trier les
                # indices dans l'ordre croissant !!! (ils sont désordonnés dans 
                # _geodataset). Sinon QGIS n'arrive pas à lire les fichiers !
                dims = ['time', 'y', 'x']) 
# =============================================================================
#                 # [AVANT : dims = ['time', 'latitude', 'longitude'])]
# =============================================================================
                # L'ordre conseillé des dimensions est t, lat, lon, mais ça n'a
                # pas d'incidence.
            
            # Conversion en Dataset :
            dataset_allyears = dataarray_allyears.to_dataset(name = _variable)
            
            print("Exportation...")
            if output_format in ['NetCDF', 'nc']:
                #% Pour NetCDF :
                # - - - - - - - -        
                # Formatage des attributs selon les conventions QGIS :
                # Insert georeferencing metadata :
                dataset_allyears.rio.write_crs("epsg:27572", inplace = True)
                dataset_allyears.x.attrs = {'standard_name': 'projection_x_coordinate',
                                            'long_name': 'x coordinate of projection',
                                            'units': 'Meter'}
                dataset_allyears.y.attrs = {'standard_name': 'projection_y_coordinate',
                                            'long_name': 'y coordinate of projection',
                                            'units': 'Meter'}
# =============================================================================
#                 # AVANT :
#                 dataset_allyears.longitude.attrs = {'units': 'degrees_east',
#                                            'long_name': 'longitude'}
#                 dataset_allyears.latitude.attrs = {'units': 'degrees_north',
#                                            'long_name': 'latitude'}
# =============================================================================
                # dataset_allyears.time.attrs = {'units': 'days since 1970-01-01',
                #                       'calendar': 'gregorian',
                #                       'long_name': 'time'}
                dataset_allyears.attrs = {'Conventions': 'CF-1.6'}
                
                dataset_allyears.to_netcdf(output_file + ".nc")
                (folder_name, file_name) = os.path.split(output_file + ".nc")
                print("> Le fichier \'{}\' a été créé dans le dossier \'{}\'.".format(
                    file_name, folder_name))
            
            else:
                #% Pour TIFF :
                # - - - - - - -
                dataset_allyears[_variable].rio.to_raster(output_file + '.tiff')
    # =============================================================================
    #             # Méthode sans rioxarray :
    #             # (Problème : les valeurs des coordonnées semblent désordonnées)
    #             with rasterio.open(output_file + ".tiff", 'w', 
    #                                **profile_base) as out_f:
    #                 for d, _date in enumerate(all_date_indexes):
    #                     # out_f.write_band(d + 1, raster_data_allyears[d, :, :])
    #                     out_f.write_band(d + 1, 
    #                                      dataset_allyears[_variable].sel(time = _date))      
    # =============================================================================
                (folder_name, file_name) = os.path.split(output_file + ".tiff")
                print("> Le fichier \'{}\' a été créé dans le dossier \'{}\'.".format(
                    file_name, folder_name))
                    
    return dataset_allyears


###############################################################################
#%%% ERA5-Land
def correct_bias(input_file, correct_factor = 1, to_dailysum = True,
                 progressive = False):
    """
    Fonction plutôt à utiliser sur les données finales formatées
    correct_factor:
        - for precipitations (hourly) : 1.8
        - for precipitations (daily sum): 0.087 
        - for radiations (daily sum): 0.0715 (and progressive = True)
        - for potential evapotranspiration pos (daily sum): 0.04
    """
    
    print("\nLoading...")
    with xr.open_dataset(input_file, decode_coords = 'all') as ds:
        ds.load() # to unlock the resource
        
    ds2 = ds.copy(deep = True)
    
    print("\nCorrecting values: multiply by {}...".format(correct_factor))
    
    if not progressive: # constant correction for any day
        var_list = main_var(ds)
        suffix = f"{correct_factor}"
        for var in var_list:
            ds2[var] = ds2[var]*correct_factor
        correct_factor_max = correct_factor
    
    elif progressive: # varying correction along the year (stronger in summer, weaker in winter)
        var_list = main_var(ds)
        suffix = f"{correct_factor}P1"
        monthly_correction = np.array([0.74130111, 0.6586179, 1.04861236, 
                                       0.98615636, 0.96493336, 1.15048825, 
                                       1.06386533, 1.16570181, 1.001253, 
                                       0.81417812, 0.71620761, 0.76901861]) * correct_factor
        # Redefinition of correct_factor as a monthly correction:
        # correct_factor = ds2.mean(dim = ['latitude', 'longitude']).groupby("time.month").mean()
        # correct_factor[var] = monthly_correction
        correct_factor_monthly = xr.DataArray(monthly_correction, dims = 'month', coords = {'month': list(range(1,13))})
        for var in var_list:
            ds2[var] = ds2[var].groupby('time.month', squeeze = False)*correct_factor_monthly
        
        ds2 = ds2.drop("month")
        correct_factor_max = float(correct_factor_monthly.max())
        
# =============================================================================
#         # Pipeline pour calculer les coefficients
#         month_length = ds.time.dt.days_in_month
#         weights = (
#             month_length.groupby("time.month") / month_length.groupby("time.month").sum())
#         # Pour vérifier que la somme des pondérations fait bien 1 pour chaque mois
#         np.testing.assert_allclose(weights.groupby("time.month").sum().values, np.ones(12))
#         ds_weighted = (ds * weights).groupby("time.month").sum(dim="time")
#         ds_weighted.mean(dim = ['longitude', 'latitude'])
#         # Référence (visuelle, d'après https://loa.univ-lille.fr/documents/LOA/formation/stages/B_Anh.pdf)
#         [20, 30, 90, 125, 145, 170, 165, 155, 100, 50, 25, 20] # kWh/m²/month
#         [2322581, 3857143, 10451613, 15000000, 16838710, 20400000,
#          19161290, 18000000, 12000000, 5806452, 3000000, 2322581] # J/m²/d
# =============================================================================
        
    
    unit_factor = 1
    if to_dailysum: # Si les données d'entrée sont des moyennes en m/h :
        print("\nCorrecting units:")
        print("   input values are in .../h")
        print("   multiply by 24 --> .../d")
        for var in var_list:
            ds2[var] = ds2[var]*24
        unit_factor = 24
        
    
    print("\nCorrecting encodings...")
    for var in var_list:
        ds2[var].encoding = ds[var].encoding
        ds2[var].encoding['scale_factor'] = ds[var].encoding['scale_factor']*correct_factor_max*unit_factor
        ds2[var].encoding['add_offset'] = ds[var].encoding['add_offset']*correct_factor_max*unit_factor

    ds2.x.encoding['_FillValue'] = None
    ds2.y.encoding['_FillValue'] = None
    
    print("\nExporting...")
    output_file = os.path.splitext(input_file)[0] + f" x{suffix}.nc"
    ds2.to_netcdf(output_file)
    


###############################################################################
# Complete CWatM climatic variables
def secondary_era5_climvar(data):
    """
    This function computes the secondary data from ERA5-Land data, such as:
        - crop and water standard ETP from pan evaporation
        - wind speed from U- and V-components
        - relative humidity from T°, P and dewpoint

    Parameters
    ----------
    data : filepath or xarray.dataset (or xarray.dataarray)
        Main dataset used to generate secondary quantities.

    Returns
    -------
    None. Generates intended files.

    """
    
    # ---- Load main dataset
    data_ds = geo.load_any(data, decode_coords = 'all', decode_times = True)
    var = geo.main_var(data_ds)
    
    # ---- Corrections
    if var == 'pev': 
        ETref, EWref = compute_Erefs_from_Epan(data_ds)
        geo.export(ETref, ' ET0crop'.join(os.path.splitext(data_ds)))
        geo.export(EWref, ' EW0'.join(os.path.splitext(data_ds)))
    
    elif var in ['u10', 'v10']:
        if var == 'u10': 
            v10 = input("Filepath to V-component of wind")
            u10 = data_ds
            
        elif var == 'v10':
            u10 = input("Filepath to U-component of wind")
            v10 = data_ds
        
        wind_speed_ds = compute_wind_speed(u10, v10)
        geo.export(wind_speed_ds, ' wind_speed'.join(os.path.splitext(data_ds)))
    
    elif var == 'rh':
        dewpoint_file = input("Filepath to dewpoint data: ")
        temperature_file = input("Filepath to 2m mean temperature data: ")
        pressure_file = input("Filepath to surface pressurce data: ")
        rhs_ds = compute_relative_humidity(
            dewpoint_input_file = dewpoint_file, 
            temperature_input_file = temperature_file,
            pressure_input_file = pressure_file,
            method = "Sonntag"
            )
        geo.export(rhs_ds, ' Relative humidity'.join(os.path.splitext(data_ds)))


#%% main
if __name__ == "__main__":
    # Format the inputs (assumed to be strings) into floats
    sys.argv[1] = float(sys.argv[1])
    sys.argv[2] = float(sys.argv[2])