"""Soft-calling segmentation job for PFL loans (Snowflake Snowpark handler).

Pipeline, as implemented below:

1. ``fetch_data``            - run the segmentation SQL and load it into pandas.
2. ``df_processor``          - keep/rename the columns the rules need and turn
                               the NACH flag into 0/1.
3. ``segment_dict_extractor``- classify every loan with :class:`Segment`.
4. ``main``                  - entry point; returns the result as a Snowpark
                               DataFrame.
"""

# Importing all required modules
import csv
import json
import logging
import math
import os
import sys
import time
import traceback
from datetime import date, datetime, timedelta
from statistics import mean

import dateutil.relativedelta
import numpy as np
import pandas as pd
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import col

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# NOTE(review): not referenced anywhere in the visible code; kept because other
# code may import it.
type_Code_mapping = None

# Query output column -> attribute name read by ``Segment``.
# Dict order is also the column order of the processed frame.
_COLUMN_RENAMES = {
    'LOAN_ID': 'LOAN_ID',
    'MOB': 'mob',
    'CURRENT_BUCKET': 'current_bucket',
    'BOUNCE_3M': 'bounce_3M',
    'BOUNCE_6M': 'bounce_6M',
    'BOUNCE_12M': 'bounce_12M',
    'AVG_DPD_12M': 'avg_dpd_12M',
    'MAX_DPD_BUCKET': 'max_dpd_3M',
    'NACHE_REGISTERED': 'nache_registered',
    'FINAL_STATUS': 'gps_status',
}

# The query emits exactly 'YES' / 'NO' for NACHE_REGISTERED.
_NACH_MAPPER = {'NO': 0, 'YES': 1}

# Kept as a plain (non f-) string: it has no placeholders, and an f-string would
# start interpreting any future ``{`` in the SQL as a format field.
# The ``//`` line comment inside BOUNCE_DATA MUST stay on its own line; if the
# query is ever flattened onto one line it comments out the rest of that line.
_SEGMENTATION_QUERY = """
    WITH start_date AS (
        SELECT DATE_TRUNC('month', current_date()) AS start_date
    ),
    end_date AS (
        SELECT last_day(current_date)+1 AS end_date
    ),
    BASE AS (
        SELECT LOAN_ID,
               PARTNER_LOAN_ID,
               DISBURSAL_DATE,
               FIRST_EMI_DUE_DATE,
               LOAN_STATUS,
               CURRENT_DPD_BKT AS LAST_MONTH_DPD_BKT,
               PRINCIPAL_OUTSTANDING AS OPENING_PRINCIPAL,
               LOAN_AMOUNT,
               (MOB+1) AS MOB
        FROM "PC_STITCH_DB"."NBFC_PROD"."LOAN_PERFORMANCE_DETAILS"
        WHERE PARTNER_NAME IN ('PFL')
          AND LOAN_BOOK_DATE = DATE_TRUNC('month', current_date())-1
    ),
    NACH as (
        select LOAN_ID,PRIMARY_NACH_STATUS
        from PC_STITCH_DB.NBFC_PROD.NACH_DATA_FINAL_V1
        qualify row_number() over (partition by LOAN_ID order by PRIMARY_NACH_ACTIVATION_DATE desc)=1
    ),
    GPS AS (
        SELECT * FROM DEMO_DB.PUBLIC.GPS_DATA
    ),
    BOUNCE_DATA AS (
        WITH historical_bounce_data AS (
            SELECT LOAN_ID,
                   BOUNCE_FLAG,
                   DATEDIFF(month, (select end_date from end_date), LOAN_BOOK_DATE) AS EV_MONTH
            FROM (
                SELECT *
                FROM "PC_STITCH_DB"."NBFC_DWH_PRE_PROD"."CONSOLIDATED_ALL_PARTNER_BOUNCE"
                WHERE PARTNER = 'PFL'
                  AND LOAN_BOOK_DATE >= Dateadd(Month, -12, (select end_date from end_date))
                  AND LOAN_BOOK_DATE < (select end_date from end_date)
            )
        )
        SELECT TRIM(LOAN_ID) AS LOAN_ID,
               //max(case when EV_MONTH >= -3 THEN current_dpd_bkt END) as plus30_ever,
               count(distinct case when EV_MONTH >= -3 and bounce_flag = 1 then EV_MONTH end) as bounce_3m,
               count(distinct case when EV_MONTH >= -6 and bounce_flag = 1 then EV_MONTH end) as bounce_6m,
               count(distinct case when EV_MONTH >= -12 and bounce_flag = 1 then EV_MONTH end) as bounce_12m
        FROM historical_bounce_data
        GROUP BY LOAN_ID
    ),
    DPD_BKT AS (
        SELECT LOAN_ID,
               max(current_dpd_bkt) as max_dpd_3m
        FROM "PC_STITCH_DB"."NBFC_PROD"."LOAN_PERFORMANCE_DETAILS"
        WHERE PARTNER_NAME in ('PFL')
          AND DATEDIFF(month, (select end_date from end_date), LOAN_BOOK_DATE)>=-3
        GROUP BY 1
    ),
    CURRENT_DPD AS (
        SELECT PROPOSAL_NO,
               DPD,
               DATE_TRUNC('month', MIS_DATE) AS MIS_MONTH,
               INST_DUE_DATE
        from "PC_STITCH_DB"."NBFC_DWH_PRE_PROD"."PFL_DAILY_MIS"
        WHERE MIS_MONTH = (SELECT start_date FROM start_date)
        QUALIFY ROW_NUMBER() OVER (PARTITION BY PROPOSAL_NO ORDER BY MIS_DATE DESC,ETL_DATE DESC)=1
    )
    SELECT BASE.LOAN_ID,
           BASE.DISBURSAL_DATE,
           BASE.FIRST_EMI_DUE_DATE AS FEMI_DATE,
           BASE.LOAN_STATUS,
           CASE WHEN NACH.PRIMARY_NACH_STATUS IS NULL THEN 'NO'
                WHEN NACH.PRIMARY_NACH_STATUS IN ('ACTIVE','Active') THEN 'YES'
                ELSE 'NO'
           END AS NACHE_REGISTERED,
           CASE WHEN GPS.STATUS_FOR_SEGMENTATION IN ('Inactive') then 'Inactive'
                else 'Active'
           end AS FINAL_STATUS,
           GREATEST(DATEDIFF(month,FEMI_DATE,(select end_date from end_date)),0) AS MOB,
           COALESCE(BOUNCE_DATA.bounce_3m,0) AS BOUNCE_3M,
           COALESCE(BOUNCE_DATA.bounce_6m,0) AS BOUNCE_6M,
           COALESCE(BOUNCE_DATA.bounce_12m,0) AS BOUNCE_12M,
           CASE WHEN Z.DPD <= 0 THEN 0
                WHEN Z.DPD > 0 AND Z.DPD <= 30 THEN 1
                WHEN Z.DPD > 30 AND Z.DPD <= 60 THEN 2
                WHEN Z.DPD > 60 AND Z.DPD <= 90 THEN 3
                WHEN Z.DPD > 90 AND Z.DPD <= 120 THEN 4
                WHEN Z.DPD > 120 AND Z.DPD <= 150 THEN 5
                WHEN Z.DPD > 150 AND Z.DPD <= 180 THEN 6
                WHEN Z.DPD > 180 AND Z.DPD <= 210 THEN 7
                WHEN Z.DPD > 210 AND Z.DPD <= 240 THEN 8
                WHEN Z.DPD > 240 AND Z.DPD <= 270 THEN 9
                WHEN Z.DPD > 270 AND Z.DPD <= 300 THEN 10
                WHEN Z.DPD > 300 AND Z.DPD <= 330 THEN 11
                WHEN Z.DPD > 330 AND Z.DPD <= 360 THEN 12
                WHEN Z.DPD > 360 THEN 13
           END AS CURRENT_BUCKET,
           COALESCE(GREATEST(CURRENT_BUCKET,DPD_BKT.max_dpd_3m),0) AS MAX_DPD_BUCKET,
           CASE WHEN BOUNCE_3M = 0 THEN 0
                WHEN AVG_DPD.DPD_DAYS IS NULL THEN 0
                ELSE (AVG_DPD.DPD_DAYS/BOUNCE_3M)
           END AS AVG_DPD_12M
    FROM BASE
    LEFT JOIN NACH ON BASE.LOAN_ID = NACH.LOAN_ID
    LEFT JOIN GPS ON BASE.LOAN_ID = GPS.LOAN_ID
    LEFT JOIN BOUNCE_DATA ON BASE.LOAN_ID = BOUNCE_DATA.LOAN_ID
    LEFT JOIN DPD_BKT ON BASE.LOAN_ID = DPD_BKT.LOAN_ID
    LEFT JOIN CURRENT_DPD Z ON BASE.PARTNER_LOAN_ID = Z.PROPOSAL_NO
    LEFT JOIN DEMO_DB.PUBLIC.PFL_AVG_DPD_3M AVG_DPD ON BASE.LOAN_ID = AVG_DPD.LOAN_ID
    WHERE LOAN_STATUS = 'Active'
"""


class Segment:
    """
    Rule engine that classifies one loan into a soft-calling segment.

    The constructor receives a list of record dicts and uses only the first
    element. That record must provide the keys ``mob``, ``bounce_3M``,
    ``bounce_6M``, ``bounce_12M``, ``avg_dpd_12M``, ``max_dpd_3M``,
    ``nache_registered`` (0/1) and ``gps_status``.

    ``get_segment_final`` is the entry point; it returns one of the labels
    ``'S1'`` .. ``'S7'`` or ``None`` when no rule matches.
    """

    def __init__(self, _case):
        # Only the first record of the list is evaluated.
        self.case = _case[0]
        self.calculate_mob()
        self.calculate_bounce()
        self.calculate_payment()
        self.calculate_special()

    def calculate_mob(self):
        """Copy the ``mob`` value of the record onto the instance."""
        self.mob = self.case['mob']

    def calculate_bounce(self):
        """Copy the 3/6/12-month bounce counts onto the instance."""
        self.bounce_3M = self.case['bounce_3M']
        self.bounce_6M = self.case['bounce_6M']
        self.bounce_12M = self.case['bounce_12M']

    def calculate_payment(self):
        """Copy the DPD inputs (``avg_dpd_12M``, ``max_dpd_3M``) onto the instance."""
        self.avg_dpd_12M = self.case['avg_dpd_12M']
        self.max_dpd_3M = self.case['max_dpd_3M']

    def calculate_special(self):
        """Copy the NACH flag and GPS status onto the instance."""
        self.nache_registered = self.case['nache_registered']
        self.gps_status = self.case['gps_status']

    def segment_123_mob(self):
        """Rules for ``mob`` 1-3; returns ``None`` for any other ``mob``."""
        if self.mob == 1:
            return 'S2'
        if self.mob in (2, 3):
            if self.nache_registered == 0:
                return 'S6'
            return 'S2' if self.bounce_3M == 0 else 'S6'
        return None

    def segment_4_12_mob(self):
        """Rules driven by the 6-month bounce count (used for ``mob`` 4-12).

        The branches are order dependent: the first matching rule wins.
        """
        if self.bounce_6M >= 4:
            return 'S7'
        if (self.bounce_6M > 1) and (self.bounce_6M < 4):
            return 'S5'
        if (self.bounce_6M == 1) and (self.bounce_3M == 1):
            return 'S4'
        if (self.nache_registered == 0) and (self.avg_dpd_12M < 15):
            return 'S3'
        if (self.bounce_6M == 1) and (self.bounce_3M == 0):
            return 'S3'
        if self.bounce_6M == 0:
            return 'S1'
        return None

    def segment_12_plus_mob(self):
        """Rules driven by the 12-month bounce count (used for ``mob`` > 12).

        The branches are order dependent: the first matching rule wins.
        """
        if self.bounce_12M >= 6:
            return 'S7'
        if self.bounce_12M in (3, 4, 5):
            return 'S5'
        if (self.bounce_12M in (1, 2)) and (self.bounce_3M in (1, 2)):
            return 'S4'
        if (self.nache_registered == 0) and (self.avg_dpd_12M < 15):
            return 'S3'
        if (self.bounce_12M in (1, 2)) and (self.bounce_3M == 0):
            return 'S3'
        if self.bounce_12M == 0:
            return 'S1'
        return None

    def segment_special(self):
        """Return ``'S7'`` for a high average DPD combined with a risk flag.

        Fires when ``avg_dpd_12M >= 15`` and at least one of: ``max_dpd_3M >= 1``,
        GPS status other than ``'Active'``, or NACH not registered. Otherwise
        returns ``None``.
        """
        if self.avg_dpd_12M >= 15:
            if (self.max_dpd_3M >= 1) or (self.gps_status != 'Active') or (self.nache_registered == 0):
                return 'S7'
        return None

    def get_segment(self):
        """Return the segment before the final overrides are applied.

        ``mob <= 3`` uses the early-MOB rules. ``mob >= 4`` tries the special
        rule first and falls back to the bounce-based rules for the matching
        MOB band. Any other value (e.g. a missing/NaN ``mob``, for which both
        comparisons are false) yields ``None``.
        """
        if self.mob <= 3:
            return self.segment_123_mob()
        if self.mob >= 4:
            special = self.segment_special()
            if special is not None:
                return special
            if (self.mob >= 4) and (self.mob <= 12):
                return self.segment_4_12_mob()
            if self.mob > 12:
                return self.segment_12_plus_mob()
            return None
        return None

    def get_segment_final(self):
        """Return the final segment after applying the overrides.

        Overrides, first match wins:
        * ``mob`` in 0-3 is always ``'S2'`` (this supersedes whatever
          ``segment_123_mob`` returned for those values);
        * ``'S6'`` with ``avg_dpd_12M <= 15`` becomes ``'S2'``;
        * ``'S7'`` with ``avg_dpd_12M <= 15`` becomes ``'S6'``;
        * ``'S4'``-``'S7'`` with ``avg_dpd_12M > 15`` becomes ``'S7'``.
        Anything else is returned unchanged (possibly ``None``).
        """
        _segment = self.get_segment()
        if self.mob in (0, 1, 2, 3):
            return 'S2'
        if (_segment == 'S6') and (self.avg_dpd_12M <= 15):
            return 'S2'
        if (_segment == 'S7') and (self.avg_dpd_12M <= 15):
            return 'S6'
        if (_segment in ('S4', 'S5', 'S6', 'S7')) and (self.avg_dpd_12M > 15):
            return 'S7'
        return _segment


def fetch_data(session):
    """Run the segmentation query and return its rows as a pandas DataFrame.

    Args:
        session: Snowpark session used to execute the SQL.

    Returns:
        pandas.DataFrame with the query columns (upper-case names as returned
        by the query), restricted to rows whose ``LOAN_ID`` is not null.
    """
    #### Segmentation data query
    segmentation_data = session.sql(_SEGMENTATION_QUERY)

    # read sql query
    logging.info("Fetching data from segment query...")
    segmentation_data = segmentation_data.to_pandas()

    # remove null values
    logging.info(f"Running Segment Data query for {segmentation_data.columns}")
    segmentation_data = segmentation_data[segmentation_data['LOAN_ID'].notnull()]
    logging.info(f"Segmentation data is {segmentation_data}")
    return segmentation_data


def df_processor(session, df):
    """Select and rename the columns needed by ``Segment`` and encode NACH.

    Args:
        session: Unused; kept so the call signature stays unchanged.
        df: Frame produced by ``fetch_data``. Must contain every key of
            ``_COLUMN_RENAMES``, otherwise pandas raises ``KeyError``.

    Returns:
        A new DataFrame (the input is not modified) with the renamed columns
        and ``nache_registered`` mapped from ``'YES'``/``'NO'`` to 1/0. Any
        other value becomes NaN.
    """
    # ``rename`` returns an independent frame, so the assignment below does not
    # write into a selection of the caller's frame (the previous in-place
    # column assignment on ``df[[...]]`` ran into pandas' chained-assignment /
    # SettingWithCopyWarning behaviour).
    df = df[list(_COLUMN_RENAMES)].rename(columns=_COLUMN_RENAMES)
    logging.info(f"Columnsss are :{df.columns}")

    ## Corrections
    # GPS status needs no remapping here: the query already emits only
    # 'Active' / 'Inactive' in FINAL_STATUS.
    df['nache_registered'] = df['nache_registered'].map(_NACH_MAPPER)

    logging.info(f"Processing df {df}")
    return df


def segment_dict_extractor(session, df):
    """
    Classify every loan in ``df`` and store the label in ``"Segment Names"``.

    Args:
        session: Unused; kept so the call signature stays unchanged.
        df: Frame produced by ``df_processor``.

    Returns:
        The same ``df`` object with an added ``"Segment Names"`` column that
        holds the result of ``Segment.get_segment_final`` for each loan.

    If a ``LOAN_ID`` occurs more than once, the first row with that id decides
    the segment for all of its rows (this matches the previous row-by-row
    lookup, which always evaluated the first matching record).
    """
    # One pass over the records instead of re-filtering the whole frame for
    # every row, which was quadratic in the number of loans.
    first_record_by_loan = {}
    for record in df.to_dict(orient='records'):
        first_record_by_loan.setdefault(record['LOAN_ID'], record)

    segment_dict = {
        loan_id: Segment([record]).get_segment_final()
        for loan_id, record in first_record_by_loan.items()
    }
    df["Segment Names"] = df["LOAN_ID"].map(segment_dict)

    logging.info(f"Segment dict is {segment_dict} and df is {df.columns}")
    return df


def main(session: snowpark.Session):
    """Handler entry point: fetch, prepare and segment, then return the result.

    Args:
        session: Active Snowpark session.

    Returns:
        Snowpark DataFrame created from the segmented pandas frame.
    """
    df = fetch_data(session)
    df = df_processor(session, df)
    df = segment_dict_extractor(session, df)
    snowpark_df = session.create_dataframe(df)
    return snowpark_df