Kendall Kinnear System Analyst Standard Motor Products, Inc. (214) 843-0841 okkinnear+MMSA@gmail.com Looking for an easy way to monitor QHST messages Remote journal outages occurring Messages only in QHST No monitoring software Started investigating options Purchase monitoring software ($$$$) Process QHST files with a program Find another way – Watches I’d never heard of these; it turns out they are pretty cool Officially System Event Watches Implemented in V5R4 Types of events to monitor for Messages anywhere in the system Any job log History log Message queue (*MSGQ) Licensed Internal Code (LIC) Log Product Activity Log (V6.1 or higher) User written program called when event occurs Start Watch – STRWCH End Watch – ENDWCH Work with Watches – WRKWCH Unique across system *NONE required if watching PAL or LIC Message queue to watch: - *SYSOPR - *JOBLOG - *HSTLOG - Specific message queue Required if message queue is *JOBLOG User written program can Execute any command, APIs, etc. Execute any function accessible to a *USER state/*USER domain program Adopt owner authority if necessary Four parameters required Watch event (*MSGID, *LICLOG, *PAL) Session identifier Error detection (output) Event specific data Watch Event Why the program was called Values: *MSGID - A match on a message id & comparison data *LICLOG - A match on a LIC log & comparison data *PAL - A match on a PAL & comparison data *STRWCH – Watch session is starting *ENDWCH – Watch session is ending Session ID The session ID specified or generated by STRWCH Error Detected Return code to indicate completion status Blank = successful *ERROR = Error occurred in watch program, watch is canceled Event Specific Data Data structure Details about event that triggered the watch Parse to retrieve information QSCWCHPS job is used to call the watch program If there is a problem with the watch program itself, check the joblog for the QSCWCHPS job WRKJOB QSCWCHPS will show all active jobs and jobs in OUTQ status To isolate the 
job, try: Use CHGJOB LOG(4 00 *SECLVL) LOGCLPGM(*YES) to make sure that messages and CL commands are logged Use the DMPCLPGM command to do a dump of the CL program including variables and error messages Include DLYJOB in program, then issue WRKACTJOB SBS(QUSRWRK) and look for the QSCWCHPS job in DLYW Issue a reply message to put the job in MSGW and use WRKACTJOB SBS(QUSRWRK) and look for the QSCWCHPS job in MSGW Single watch program/more than one watch: If you access shared storage, it could create contention If you access system objects, make sure any locks are released to avoid issues Too many watches can cause performance issues Requires *SERVICE special authority Server QSCWCHPS must be running in subsystem QUSRWRK STRPJ SBS(QUSRWRK) PGM(QSYS/QSCWCHPS) if not active Knowledge Center: http://www-01.ibm.com/support/knowledgecenter/ssw_ibm_i_71/cl/strwch.htm?lang=en STRWCH - Watch Exit Programs Explained with CL Example http://www-01.ibm.com/support/docview.wss?uid=nas8N1011571 Other articles http://www.redbooks.ibm.com/abstracts/tips0839.html?Open#contents http://support.rjssoftware.com/content/using-event-watches-iseries Remote journals receive a TCP RESET command at random times Not issued by partner IBM i Something in network spoofs the IP address of partner Results in CPF70D5 in QHST for failed journal CPC6984 issued when journal restarted Issue Robot/ALERT for each event Let Network Support know RESET received Let me know if journal doesn’t restart Simple solution until cause of TCP RESET is found Input parameters MMXRJMON: PGM PARM(&WCHOPTN &SSNID &ERRCOD &EVTDATA) DCL VAR(&WCHOPTN) TYPE(*CHAR) + LEN(10) /* Reason Program called */ DCL VAR(&SSNID) TYPE(*CHAR) + LEN(10) /* Session ID */ DCL VAR(&ERRCOD) TYPE(*CHAR) + LEN(10) /* Return Error Code */ DCL VAR(&EVTDATA) TYPE(*CHAR) + LEN(2048) /* Event Data */ Exploded parameter variables /* Event data work area for use by *DEFINED variables */ DCL VAR(&EVTDTADFND) TYPE(*CHAR) + LEN(2048) DCL VAR(&MSGID) TYPE(*CHAR) 
+ STG(*DEFINED) LEN(7) + DEFVAR(&EVTDTADFND 5) /* Watched Message */ DCL VAR(&OFSRPLDTA) TYPE(*INT) + STG(*DEFINED) LEN(4) + DEFVAR(&EVTDTADFND 441) /* Offset to Repl Data */ DCL VAR(&LENRPLDTA) TYPE(*INT) + STG(*DEFINED) LEN(4) + DEFVAR(&EVTDTADFND 445) /* Length of Repl Data */ DCL VAR(&PMSGRPLDTA) TYPE(*PTR) /* PTR to Repl Data */ DCL VAR(&BMSGRPLDTA) TYPE(*CHAR) + STG(*BASED) LEN(512) + BASPTR(&PMSGRPLDTA) /* Based Repl Data */ DCL VAR(&MSGRPLDTA) TYPE(*CHAR) + LEN(512) /* Msg Replacement Data */ Work variables /* Work variables */ DCL VAR(&JRNNAM) TYPE(*CHAR) LEN(10) /* Journal Name */ DCL VAR(&FSTLVL) TYPE(*CHAR) + LEN(132) /* 1ST LEVEL TEXT */ DCL VAR(&SCDLVL) TYPE(*CHAR) + LEN(132) /* 2ND LEVEL TEXT */ DCL VAR(&MSGDTA) TYPE(*CHAR) + LEN(132) /* MESSAGE DATA */ DCL VAR(&SYSNAME) TYPE(*CHAR) + LEN(10) /* System name */ Main logic /* Setup generic monitor to handle unexpected errors and alert support team */ MONMSG MSGID(CPF0000 LVE0000) EXEC(GOTO CMDLBL(GENERR)) /* Get system name for message handling control */ RTVNETA SYSNAME(&SYSNAME) /* COPY PARAMETER TO WORK DATA */ CHGVAR VAR(&EVTDTADFND) VALUE(&EVTDATA) /* Message replacement data */ CHGVAR VAR(&PMSGRPLDTA) + VALUE(%ADDR(&EVTDTADFND)) /* Set base pointer */ CHGVAR VAR(%OFS(&PMSGRPLDTA)) + VALUE(%OFS(&PMSGRPLDTA) + + &OFSRPLDTA) /* Point to start of data*/ IF COND(&LENRPLDTA *GT 0) + THEN(DO) /* If rpl data B01*/ CHGVAR VAR(&MSGRPLDTA) + VALUE(%SST(&BMSGRPLDTA 1 + &LENRPLDTA)) /* Store rpl data 01*/ ENDDO /* END if rpl data E01*/ Main logic /* Extract journal name from message replacement data */ CHGVAR VAR(&JRNNAM) VALUE(%SST(&MSGRPLDTA 1 10)) /* If CPF70D5 Remote Journal Failure */ IF COND(&MSGID = 'CPF70D5') + THEN(DO) /* If CPF70D5 B01*/ CHGVAR VAR(&FSTLVL) VALUE('Remote journal' *BCAT + &JRNNAM *TCAT ' failed and should + auto-restart, check journaling for + issues.') /* Build 1st level text01*/ CHGVAR VAR(&SCDLVL) VALUE('Message + CPF70D5 received for + remote journal failure. 
+ Sign-on to system + and verify remote + remote journals.') /* Build 2nd level text01*/ CALLSUBR SUBR(MIMIXLOG) /* Send messages 01*/ GOTO CMDLBL(END) /* Exit program 01*/ ENDDO /* End msg CPF70D5 E01*/ Main logic /* If CPC6984 Remote Journal Started */ IF COND(&MSGID = 'CPC6984') + THEN(DO) /* If CPC6984 B01*/ CHGVAR VAR(&FSTLVL) VALUE('Remote journal' *BCAT + &JRNNAM *TCAT ' started.') /* Build 1st level text01*/ CHGVAR VAR(&SCDLVL) VALUE('Message + CPC6984 received for + remote journal start.') /* Build 2nd level text01*/ CALLSUBR SUBR(MIMIXLOG) /* Send messages 01*/ GOTO CMDLBL(END) /* Exit program 01*/ ENDDO /* End msg CPC6984 E01*/ /* Exit program */ GOTO CMDLBL(END) /* Exit program */ Error handling and send messages /* General Error Handler */ GENERR: CHGVAR VAR(&FSTLVL) VALUE('Unexpected + error occurred in MONRJCMN. + Monitor ended') /* Build Message */ CHGVAR VAR(&ERRCOD) + VALUE('*ERROR') /* Indicate error */ CALLSUBR SUBR(MIMIXLOG) /* Send messages */ /* MIMIX Log Messages Subroutine */ SUBR SUBR(MIMIXLOG) /* Start subroutine */ CHGVAR VAR(&MSGDTA) VALUE(&FSTLVL + *CAT &SCDLVL) /* Build msg data */ MIMIX/ADDMSGLOGE MSGID(LVI0005) + MSGDTA(&MSGDTA) SEV(40) + PRD(*MIMIX) /* Log message */ CALLSUBR SUBR(ROBOTMSG) /* Page with ROBOT */ ENDSUBR Error handling and send messages /* ROBOT Paging Subroutine */ SUBR SUBR(ROBOTMSG) IF COND(&SYSNAME = XXXXXXXXXX) + THEN(DO) /* If on SMPLEW01 */ RBTALRLIB/RBASNDMSG MSG(&FSTLVL) + TOPG(MIMIXINF) RSP(*NO) /* Send alert */ MONMSG MSGID(CPF0000 LVE0000) /* Ignore errors */ ENDDO /* END if LEW01 */ ELSE CMD(DO) /* If not SMPLEW01 */ MIMIX/RUNCMD CMD(RBTALRLIB/RBASNDMSG + MSG(&FSTLVL) TOPG(MIMIXINF) + RSP(*NO)) PROTOCOL(*TCP) + HOST(XXXXXXXXXX) /* Page via LEW01 */ MONMSG MSGID(CPF0000 LVE0000) /* Ignore errors */ ENDDO ENDSUBR END: ENDPGM Questions?