# 
# *****************************************************************
# *                                                               *
# *   Copyright 2002 Compaq Information Technologies Group, L.P.  *
# *                                                               *
# *   The software contained on this media  is  proprietary  to   *
# *   and  embodies  the  confidential  technology  of  Compaq    *
# *   Computer Corporation.  Possession, use,  duplication  or    *
# *   dissemination of the software and media is authorized only  *
# *   pursuant to a valid written license from Compaq Computer    *
# *   Corporation.                                                *
# *                                                               *
# *   RESTRICTED RIGHTS LEGEND   Use, duplication, or disclosure  *
# *   by the U.S. Government is subject to restrictions  as  set  *
# *   forth in Subparagraph (c)(1)(ii)  of  DFARS  252.227-7013,  *
# *   or  in  FAR 52.227-19, as applicable.                       *
# *                                                               *
# *****************************************************************
#
# HISTORY
# 
# @(#)$RCSfile: sysman_utils.tcl,v $ $Revision: 1.1.1.1 $ (DEC) $Date: 2003/01/23 18:36:31 $
#


###################################################################
#
# Module:
#     sysman_utils.tcl
# Abstract:
#     This module contains generic sysman shared procs that can be
#     used by SUIT, MCL, and stand-alone sysmansh scripts.
#
# Notes:
#      General comments on module.
#
###################################################################

############################################################################
#
# Class:
#	SM_debug
#
# Abstract:
#	Class definitions for implementing a simple "debug_puts"
#	capability. The caller's application init's the debug object
#	with the name of an environment variable that when set,
#	enables debugging messages. When it's set to a positive integer,
#	that integers is used as a "trace level" which has an application
#	specific interpretation
#
#
# Notes:
#
# Class Description:
#	The methods for this class are:
#	   - init    - initializes and creates the debug object
#	   - puts    - print the specified debug message if debugging
#		       is enabled
#	   - get     - returns the debug state - true or false
#	   - trace   - print the specified message if the trace level
#		       is appropriate
#
############################################################################

Class SM_debug

#
# Method:
#	SM_debug init - initializes the debugging environment. 
#
# Inputs:
#	env_var - name of an environment variable that when set, enbles
#	debugging messages for this app. The value of an 
#	environment variable is assumed to be 0 for no debugging (same
#	as an undefined envar) or a positive integer. The integer is used
#	as the "trace level"
# Outputs:
# 	None
# Returns:
#	true or false
# Notes:
#	See sm_debug and sm_trace
#
SM_debug instproc init { env_var } {

    global env

    $self set debug "false"	
    if {[set ev [array get env $env_var]] != {}} {
	$self set debug "true"
	set value [lindex $ev 1]
	if { [ctype digit $value] == 1 } {
	    $self set trace_level $value
	}
    } 

} ;# end SM_debug init


#
# Method:
#	SM_debug puts - prints debugging messages if the appropriate debugging
#	environment variable has been set.  
#
# Inputs:
#	msg - the debugging message to display
# Outputs:
# 	None
# Returns:
#	None
# Notes:
#	None
#
SM_debug instproc puts { msg } {

    if { [$self set debug] == "true" } {
	puts "$msg"
    }

} ;# end SM_debug puts

#
# Method:
#	SM_debug get - returns the current debug state
#
# Inputs:
#	None
# Outputs:
# 	None
# Returns:
#	true or false
# Notes:
#	None
#
SM_debug instproc get { } {

    if { [$self set debug] == "true" } {
	return "true"
    } else {
	return "false"
    }

} ;# end SM_debug get

#
# Method:
#	SM_debug trace - prints debugging messages if the appropriate trace
#	level is defined
#
# Inputs:
#	level - if the currently define tracelevel is <= level, then
#	        print msg
#	msg   - the debugging message to display
# Outputs:
# 	None
# Returns:
#	None
# Notes:
#	None
#
SM_debug instproc trace { level msg } {

    if {[$self set trace_level] >= $level } {
	puts "$msg"
    } 
    
} ;# end SM_debug trace






# Get IDs and arguements for the specified processes on the specified
#   cluster node.
# procArrayRef - the name of an array variable. The indices are names
#                of processes sought. Names must exactly match the 
#                the service name.  (See the cluster DRI for service names.)
#                The call will set the values to a list of ID-flag
#                pairs for all processes that match each specified name.
#                If none, the value will be set to null. The pairs will
#                contain the process ID and any flags, options or parameters
#                specified when launching the command.
# nodeName - name of the cluster node that is too be interrogated.  Defaults
#            to the current node.

proc sm_getProcIds {procArrayRef {nodeName ""}} {
    upvar $procArrayRef  procArray

    # foreach service sought...
    foreach service [array names procArray] {
	set procPairs ""

	# Don't report errors; the cluster people don't care.
	#   Just treat them the same as though the process isn't running.
	# ksmReadProcInfo is implemented in C in sysmansh
	if { ! [catch {ksmReadProcInfo processes $nodeName $service} msg]} {

	    # If successful, load pid and invocation pairs for each
	    #   running process.
	    # foreach invocation (process) of the service
	    foreach process [array names processes] {
		# clear array because "array set" is really an append
		catch "unset vals"
		array set vals $processes($process)
		# if process running, add its pid and command line
		if {[expr $vals(flags) & 0x0001] == 0} {
		    lappend procPairs [list $vals(pid) $vals(argv)]
		}
	    }

	} else {
	    # puts "ksmReadProcInfo error: $msg"
	}
	set procArray($service) $procPairs
    }

}


proc sm_JobPID { jobname {PSopts {-uroot}} {node {}} } {
##
## **NOTE** Deprecated.  Use sm_getProcIds instead.
##
##  arg1 = name of the process (job) 
##  arg2 = (optional) defaults to processes owned by root
##         can supply any "ps(1)" arguments.
##  arg3 = (optional) defaults to blank -- this is the node
##         name on which we need to perform the ps command. 
##
## Gets the job's PID from the ps output.  Returns the PID on success or
## -1 on failure.  
##

global SysmanFocusHost

set host [id host]
set host [lindex [split $host .] 0]

# if a node name is provided...ensure that it is not fully qualified.
if {[llength $node] != 0} {
        set node [lindex [split $node .] 0]
}

# Get a version of SysmanFocusHost that is not fully qualified so that we can
# compare it to $host. Don't want to change global SysmanFocusHost itself.

set FocusHost $SysmanFocusHost          ;# Incidentally, could be null

# if $SysmanFocusHost is not null, extract the unqualified part.
if {[llength $SysmanFocusHost] != 0} {
    set FocusHost [lindex [split $SysmanFocusHost .] 0]
}

# if we need to look at a particular node, either focused or a cluster member, 
# we need to check to see if we are calling the local host.  If not, then
# use rsh, otherwise, just look locally.
if {[llength $node] != 0  &&  [cequal $node $host] != 1} {
    if { ![cluster member status $node] } {
	return -1
    }
    set node_intercnct [cluster member interconnect $node]
    set fd [open "|rsh -n $node_intercnct /sbin/ps -e -o pid,command $PSopts" r ]
} elseif {[llength $FocusHost] != 0 && 
          [cequal $FocusHost $host] != 1} {
    if { ![cluster member status $FocusHost] } {
	return -1
    }
    set focus_intercnct [cluster member interconnect $SysmanFocusHost]
    set fd [open "|rsh -n $focus_intercnct /sbin/ps -e -o pid,command $PSopts" r ]
} else {
    set fd [ open "|/sbin/ps -e -o pid,command $PSopts" r ]
}

  set retval -1
  while { [gets $fd line] >= 0 } {
    regsub -all "\[ \t\]+" $line " " line
    regsub -all "^\ " $line "" line
    set lline [split  $line]

	 # Need to check the second and third items in the list to see if
	 # either one of them matches the jobname.  This is necessary 
	 # because if a process is invoked by a shell script then the name
	 # of the shell will appear in the second list item and the name of 
	 # the process will appear in the third item.  If the process is not
	 # invoked via a shell, then the process name will be the second item
	 # in the list.
	 # 
	 set list {1 2} ;# Note the first item in the list starts at index 0
	 foreach i $list { 
    	if { [lindex $lline $i ] == $jobname } {
      	set retval [lindex $lline 0]	;# return jobs PID 
    	} elseif { [file tail [lindex $lline $i ]] == $jobname } {
      	set retval [lindex $lline 0]
    	}
	 } 
  }
  close $fd
  return $retval
}


proc sm_JobP { jobname {PSopts {-uroot}} } {
##
##  arg1 = name of the process (job) 
##  arg2 = (optional) defaults to processes owned by root
##         can supply any "ps(1)" arguments.
##
## Returns 1 if jobname is currently running on the system; 0 otherwise.
##

  return [expr [sm_JobPID $jobname $PSopts ] > 0 ? 1 : 0 ]
}

proc sm_PSMPID { categoryname {node {}} } {
##
##  arg1 = name of the category 
##  arg2 = (optional) defaults to blank -- this is the node
##         name on which we need to perform the ps command. 
##
## Gets the job's PID from the sm_getProcIds call.  Returns 
## the PID on success or -1 on failure.  
##
     	global SysmanFocusHost SysmanOnCluster

	# if $SysmanFocusHost is not null, extract the unqualified part.
	if { $SysmanOnCluster } {
	
	    # if a node name is provided, ensure that it is not fully qualified
	    # if a node name was not provided, then lets look
	    # to see whether we are focused at a node.
	    if {[llength $node] != 0} {
	        set node [lindex [split $node .] 0]
	    } else {
		if {[llength $SysmanFocusHost] != 0} {
			set node [lindex [split $SysmanFocusHost .] 0]
           	}
           }

	   # next find out if the node we want to talk to is up,
	   # If no node name has been provided, and we are not 
	   # focused, as checked above, then, we are looking
	   # at ourself, and are certainly up and running.
	   if {[llength $node] != 0} {

		if { ![cluster member status $node] } {
			return -1
		}

	   }
		
        }

	if {[info exists proc_array]} {
    		unset proc_array
	}

	set proc_array($categoryname) ""

	sm_getProcIds proc_array $node

	set return $proc_array($categoryname)
	if {[cequal $return ""]} {
		set retval -1
	} else {
		# get first pair in list, and then get first value in pair
		set pair [lindex $return 0]
		set retval [lindex $pair 0]	
	}

  	return $retval
}


proc sm_PSMP { categoryname } {
##
##  arg1 = name of the category 
##
## Returns 1 if a process is currently running on the system; 0 otherwise.
##

  return [expr [sm_PSMPID $categoryname] > 0 ? 1 : 0 ]
}

proc sm_PSMARGS { categoryname {node {}} } {
##
##  arg1 = name of the category 
##  arg2 = (optional) defaults to blank -- this is the node
##         name on which we need to perform the ps command. 
##
## Gets the job's Argument from the sm_getProcIds call.  Returns 
## the PID on success or -1 on failure.  
##
     	global SysmanFocusHost SysmanOnCluster

        # if $SysmanFocusHost is not null, extract the unqualified part.
        if { $SysmanOnCluster } {

            # if a node name is provided, ensure that it is not fully qualified
            # if a node name was not provided, then lets look
            # to see whether we are focused at a node.
            if {[llength $node] != 0} {
                set node [lindex [split $node .] 0]
            } else {
                if {[llength $SysmanFocusHost] != 0} {
                        set node [lindex [split $SysmanFocusHost .] 0]
                }
           }

           # next find out if the node we want to talk to is up,
           # If no node name has been provided, and we are not
           # focused, as checked above, then, we are looking
           # at ourself, and are certainly up and running.
           if {[llength $node] != 0} {

		if { ![cluster member status $node] } {
			return -1
		}

           }
       
        }

	if {[info exists proc_array]} {
    		unset proc_array
	}

	set proc_array($categoryname) ""

	sm_getProcIds proc_array $node

	set return $proc_array($categoryname)
	if {[cequal $return ""]} {
		set retval -1
	} else {
		# get first pair in list, and then get second value in pair
		set pair [lindex $return 0]
		set retval [lindex $pair 1]	
	}

  	return $retval
}

proc sm_GetID {} {

    # this procedure will return the name of the server that the
    # application is being run for.
    # 1. Returns SysmanFocusHost if the host is focused.
    # 2. Returns the hostname if SysmanFocusHost is {} and SysmanOnCluster is 0.
    # 3. Returns the cluster alias if SysmanOnCluster is 1 and we are not
    #    focused.

    global SysmanFocusHost SysmanOnCluster

    if { $SysmanOnCluster } {
	if {$SysmanFocusHost == {} } {
	    return [cluster alias]
	} else {
	    return $SysmanFocusHost
	}
    } else {
	return [id host]
    }
}


proc sm_clusterExec {memberList cmd} {
    # A wrapper for rsh that takes a list of members and a command
    # to run on each member.
    # Before rsh'ing the command, each member's status will be 
    # checked to determine if the node is up.  This should prevent 
    # hangs.  
    # Returns a list of of the form {memb1 output1 memb2 output2 ...}

    set result {}
    foreach memb $memberList {
	if [cluster member status $memb] {
	    catch {exec rsh -n [cluster member interconnect $memb] $cmd} msg
	    lappend result $memb $msg
	} else {
	    lappend result $memb {}
	}
    }
    return $result
}


############
#  This procedure is a wrapper for /usr/sbin/rcmgr.  Use it
#  instead of execing rcmgr yourself.  If there are performance
#  problems, we will change this procedure to cache the
#  rc.config databases, saving you the trouble.
#
#  Arguments: takes the same arguments with the same semantics 
#	      as rcmgr(8).  This will not change even if we
#	      make changes to this procedure.
#
proc rcmgr {args} {
    eval exec /usr/sbin/rcmgr $args
}

#
# Procedure: 
#     sm_exec_cmd - This procedure is a wrapper for sm_rsh.
#     sm_rsh is broken in that it doesn't return a status code. Instead
#     it forces all callers to parse the returned string and extract
#     a status code. This routine simply returns the status code
#     and the string value is passed by reference. 
#
#     sm_exec_cmd - This procedure makes it easy to run a command either
#     on the localhost or on a specific host. The idea is to simplify
#     the code paths for cluster management. Instead of:
#         if {I'm on a cluster and SysmanFocusHost != "" } {
#	      sm_rsh command
#	  } else {
#	      catch exec of command on localhost
#	  }  
#      In other words, having these two code paths for every focus operation
#      is bad. Instead use a single sm_exec_cmd call. If the hostname
#      is the same as the local host, then now rsh will be performed. 
#
#
# Inputs: 
#       hostname - host to execute the command on
#       args     - The command to execute and it's arguments. 
# Outputs: 
#       outstr   - output of the executed command. 
# Returns:
#       exit code of the command
# Notes:
#	In theory, this proc could be used for arbitrary remote management
#	operations, not just those in a cluster. However, this assumes that
#	rsh or ssh is set up properly. 
#

proc sm_exec_cmd {hostname outstr args} {

    upvar $outstr str

    # tcl tends to add a pair of curly bracket around the
    # arguments being passed in. When it gets passed down to
    # the next proc, it will be 2 pairs, which will be a 
    # problem when trying to execute the arguments.
    # So, get rid of the curly bracket here...

    set args [string range $args 1 end-1]

    # moved QAR fix 92116 from sm_rsh to this new procedure,
    # which will be called instead of sm_rsh from netadapt.mcl.
    # so that this proc can be used to execute command on
    # both local and remote system.

    set localexec 0

    # check if hostname is empty, if yes, assume local host
    if { [cequal [string trim $hostname] ""] } {
	set localexec 1
    } else {
	# get qualified hostname
	catch { set hostname [host_info official_name $hostname] }
	catch { set localhost [host_info official_name [id host]] }
	set localexec [cequal $hostname $localhost]
    }

    if { $localexec } {

	# execute on local node
	set exit_status  [catch {eval [concat exec $args]} str]

    } else {

	# execute on remote node
	set status [sm_rsh $hostname $args]
    
	# sm_rsh returns "<exit status>|<output string>" so parse off the
	# status and the output (if any). Note that you cannot use list
	# commands to do this parsing since the output string is arbitrary
	# (i.e. might look like a malformed tcl list). Use string
	# functions instead
	set i [string first "|" $status]    ;# Locate the "|" delimiter

	if { $i == -1 } {			;# If "|" not found
	    # This is meant to be a developer visible error return only
	    # so no need to i18n it. 
	    set str "Error: invalid return value from sm_rsh. '|' missing."
	    return 1
	}

	if { $i == 0 } {			;# If "|" is the first character
	    # Then the return code is missing!
	    set str \
	    "Error: missing return code from sm_rsh. The return value is:\n$status"
	    return 1
	}

	set exit_status [string range $status 0 [expr $i - 1] ]
	set str	    [string range $status [expr $i +1] end]

    }

    return $exit_status

} ;# end sm_exec_cmd

#
# Procedure: 
#     sm_rsh - This procedure is a wrapper for /usr/bin/rsh.
#     The proc will capture the exitcode and exit text, if any,
#     of the command that is rsh'd.  It will return exitcode|exit_text
#     to the caller.
# Inputs: 
#       hostname - host to rsh to.
#       args - arguments passed to rsh after the hostname parameter.
# Outputs: 
#       None
# Returns:
#       return_text - exitcode|exit_text of what was rsh'd on other host.
# Notes:
#

proc sm_rsh {hostname args} {

    # check if hostname exists on cluster, if so, return 1|error_text
    if { [catch { set rc [cluster member status $hostname] } msg] != 0} {
	# hostname not in cluster
	set return_text [format "1|%s" $msg]
	return $return_text
    }
    
    # hostname is in cluster but is down
    if { $rc == 0 } {
	set return_text [format "1|$hostname is down"]
	return $return_text
    }

    # if the interconnect exists, set hostname to that
    # otherwise, hostname will be what was passed into this proc
    if { [catch {set rc [cluster member interconnect $hostname]} msg] == 0} {
	set hostname $rc
    }

  #  set catch_val [catch {exec /usr/bin/rsh -n $hostname "$args; echo @$?"} \
  #	    ret_val]

	# Making the following command more shell independent
        # This is spawning ksh on the remote host and executing
        # commands in ksh. $? only works in sh and ksh and would
 	# fail if user invokes apps from csh or any other shells
	# Attching @@ before and after the return code as a marker

	# joining the args to take care of quoted args. If the join
	# is not put the quoted args get enclosed by braces.
	set catch_val [catch {exec echo "[join $args]\; echo @@\$\?@@" | rsh $hostname /bin/ksh -c} \
                ret_val]

     # If what was rsh'd executed correctly, the catch_val will be 0.
   	# The command that was rsh'd returned an error, capture its
	# exitcode and error text.
	# Or rsh didn't execute for some reason.
	set split_list [split $ret_val \n]
	
        # Searching for the marker pattern. The return code is the
        # embedded within the marker pattern

        set indx [lsearch -regexp $split_list "^@@\[0-9\]+@@$"]
        set exitcode [lindex $split_list $indx]
        set temp_list ""

	# Removing the list element containing the marker and 
	# then joining the string with list elements before
	# and after the marker element.

        set temp_list [join [lrange $split_list 0 [expr $indx - 1]] \n]
        append exit_text ""
        append exit_text $temp_list " " [join [lrange $split_list [expr $indx + 1] end] \n]

        if {$exitcode != ""} {
	
		# Extracting the return code from the marke list element 
		# using regexp. code contains the real exit code

            if {[regexp {@@([0-9]+)@@} $exitcode tmp code]} {
                set exitcode $code
                append return_text [format "%s|%s" $exitcode $exit_text]
             } else {
                # Wrong format of return code or no return code at all
		# Actually this piece of code should never be executed
                append return_text [format "|%s" $exit_text]
            }
        } else {
             append return_text [format "%s|%s" $exitcode $ret_val]
        }
   
    set return_text [string trim $return_text]    
    return $return_text
}

#
# This procedure is a wrapper for caa_stat calls to determine whether a 
# CAA managed application is running or not.  The caa_stat call returns
# a 0 if registered or running, and a 1 if not registered or running.
# This proc will return 1, true, if running and 0, false, if not running.
#
proc caa_is_running {applname} {

	# determine if applname is registered - 0 means success, or yes.
	set rc [catch {exec /usr/bin/caa_stat -a $applname -g} msg]
	if {$rc == 0} {

		# see if applname is running - 0 means success, or yes.
		set rc [catch {exec /usr/bin/caa_stat -a $applname -r} msg]
		if {$rc == 0} {
                       return 1
                }
        }
 
	return 0
}	

#
# Procedure: 
#     sm_GetTitleBarText - This procedure is a wrapper that returns 
#     the title bar text for a given application name.  We return one 
#     of the following text:
#       <App name> on <hostname> managing <cluster>,
#       <App name> on <hostname> managing <focus hostname>, or
#       <App name> on <hostname>
# 
# Inputs: 
#       appname - application name
# Outputs: 
#       None
# Returns:
#       title_text - title bar text to be displayed on application window.
# Notes:
#

proc sm_GetTitleBarText {appname} {
    global SysmanFocusHost SysmanOnCluster I18N_catobj

    # This check determines if the object has been created yet cause it's
    # implemented as a command.  Not pretty, but it works.
    if {[cequal [info command I18N_catobj] ""]} {
	_Catalog I18N_catobj
	I18N_catobj catopen i18n_motif_shared_text
    }

    # get the hostname and cluster alias
    set hostname [exec /sbin/hostname]
    set cluster_alias [cluster alias]

    if {$SysmanOnCluster} {
	if {$SysmanFocusHost != ""} {
	    # we're on a cluster focused to a node.
	    if { ![cequal $SysmanFocusHost $hostname] } {
		# set title to <app> on <host> managing <focus host>
		set title_text [format "[I18N_catobj catgets \
			i18n_on_cluster_txt]" $appname $hostname \
			$SysmanFocusHost]
	    } else {
		# set title to <app> on <host>
		set title_text [format "[I18N_catobj catgets \
			i18n_on_host_txt]" $appname $hostname]
	    }
	} else {
	    # we are focused to the cluster alias.  set the title to
	    # <app> on <host> managing <cluster alias>
	    set title_text [format "[I18N_catobj catgets \
		    i18n_on_cluster_txt]" $appname $hostname \
		    $cluster_alias]
	}
    } else {
	# we're on a single system, set title to <app> on <host>
	set title_text [format "[I18N_catobj catgets \
		i18n_on_host_txt]" $appname $hostname]
    }
    
    return $title_text
}

#
# Procedure: 
#     prll_rsh - parallel rsh. This routine replaces sm_rsh. It issues
#     a shell command across a set of hosts returning the exit status, 
#     stdout and stderr. The now obsolete sm_rsh did this serially, issuing
#     an rsh and waiting for it to return. prll_rsh uses the bgexec
#     background exec) tcl extension to launch the rsh's in parallel. 
# 
# Inputs: 
#       node_list - list of hosts to execute the command on. The default
#		    is a cluster or subset of members of the cluster
#	time_out  - time to wait (in milliseconds) before declaring the 
#		    rsh "hung" and returning an error status for that host. 
#	cl_int	  - if true, all nodes in node_list are cluster members and
#		    the cluster interconnect will be used for all rsh's. 
#	cmd_name  - name of the command to execute
#	args	  - the arguments for the command (if any). 
#		    
# Outputs: 
#       None
# Returns:
#	return value is a keyedlist with 3 keys of the form node-status, 
#	node-output, node-error for each node in the node_list. 
#	node-status will contain the exit status of the command
#	node-output will contain the stdout of the command (if any)
#	node-error will contain the stderr of the command (if any)
#       
# Notes:
#
#	Since rsh is used to execute <cmd_name> on remote hosts, 
#	there are several annoying restrictions:
#	    1) Rsh doesn't separate a commands stdout and stderr. It
#	       opens, two sockets, one for each, and displays the output
#	       as it's available. This causes stdout and stderror output
#	       to be combined/mixed together. IOW, the output from
#	       rsh cmd and the same command executed locally can be different.
#	       We could compensate for this by redirecting all the output
#	       to stderror or stdout at the expense of distinguishing 
#	       between the two. We don't do that currently.
#	    2) rsh does not return the exit status of the command it
#	       executes. Instead, it returns it's own status - whether
#	       it was able to successfully connect to rshd on the remote
#	       system and execute commands. In order to get back the real,
#	       remote status, we append echo $? to the command. That
#	       puts the exit status in the command output and we parse
#	       it out. Because of 1) above, it's a bit trickier than
#	       just reading the last line of output. See the code. 
#
#	A global array sm_prll_arr is used in this proc. 
#	This array is indexed by node,status, node,output
#	and node,error as the variable names. The
#	array has variable, value pairs.
#	Output, status and error are got from the 
#	bgexec. The output, status and error from 
#	bgexec are further parsed to get the correct 
#	output, error and status for each command 
#	executed on each node. 
#
#	This array holds one other variable node,result 
#	which is used to hold the exit status of bgexec 
#	command 
#
# Restrictions:
#	Currently, there is an artifical restriction that all the nodes
#	must be in the same domain. This is because unqualified node names
#	are used in resturn stati, keyed list indexes, etc. 
#	[pjw: easy bug to fix however] 
#
proc prll_rsh { node_list time_out cl_inct cmd_name args } {


    set local_host [lindex [split [id host] .] 0]
   
    # Global array which holds the values of 
    #  status, output and error for
    # a particular node for a particular command
    # Syntax of the index would be arr($hostname,status),
    # arr($hostname,output)
    # and arr($hostname,error)
    
    global sm_prll_arr
   
    # Unsetting global array before starting
    if {[info exists sm_prll_arr]} {
	unset sm_prll_arr
    }
   
    set real_list ""
    foreach node $node_list {
	lappend real_list [lindex [split $node .] 0]
    }
   
    foreach element $real_list { 
	# Check for cluster membership only when 
	# user sets the cl_inct to true

	if {[cequal $cl_inct "true"]} {
	    if { [catch { set rc [cluster member status \
		$element]} msg] != 0} {
	 
	    # hostname not in cluster so just let the user 
	    #know this fact
	    # no error messages..
		keylset retval $element-status 1
		keylset retval $element-output ""
		keylset retval $element-error "Node $element \
                              not in cluster"
	    
	    # Don't want any further processing for this 
	    # node
		continue
	    }

	    # hostname is in cluster but is down
	    # So let the user know of this fact
	    if { $rc == 0 } {
		keylset retval $element-status 1
		keylset retval $element-output ""
		keylset retval $element-error "Node $element is down"
		continue
	    }
	    # Cluster membership check ends 
	}

	# Initializing the global array variables
	 set sm_prll_arr($element,status) ""
	 set sm_prll_arr($element,output) ""
	 set sm_prll_arr($element,error) ""
	
	# Parsing the hostname and extracting the unqualified name
	# !! This wouldn't work if the list nodes are in different
	# domains
	# !! for now all the nodes in the cluster belong 
	# to a single domain

	# If the cl_inc is true then use cluster interconnect
	# else use the name of the node for rsh 
	if {[cequal $cl_inct "true"]} {
	    set node_intercnct [cluster member interconnect \
			$element]
	} else {
	    set node_intercnct $element
	}

	# Checking to see if running on a local node 
	# and executing locally if
	
	if {[info exists sm_prll_arr($element,result)]} {
	    unset sm_prll_arr($element,result)
	}
	
	# Note:- If the command is invalid command then there are 
	# 2 sceanrios. On the local node bgexec will complain and 
	# exit out without doing anything. In this case we can 
	# check the exit code of bgexec and do info exists, if not
	# then the command didn't execute and hence no point 
	# waiting for the status
	# since this is never going to happen.

	# In the remote node case, even though the command 
	# is invalid, 
	# since rsh is valid
	# we are going to get the status, output and error, 
	# so in this case we can safely wait
	# on the status variable. Also I am embedding the 
	# return value between two @@. Rsh has two independent
	# sockets for collecting
	# stderr and stdout. And rsh reports the stderr/stderr
	# in a single return value. Since stderr and stdout 
	# are on different sockets
	# the order in which rsh collects the data will
	# be different.

	if {![cequal $element $local_host]} {	;# If not the local host

	

	    # NOTE:- The weird "echo "command" | rsh ksh -c syntax 
	    # is intentional. Since cmd_name and args can contain
	    # completely arbitrary text, such as quotes, curlies, 
	    # semicolons, etc., it is critical that we prevent tcl 
	    # from eval'ing these arguments or otherwise trying 
	    # to parse them. The more natual 
	    #      rsh ksh -c "$cmd_name $args\;echo |$?"
	    # flat out does not work. No combination of curlies, 
	    # or quoting can make it work. 

	    set sm_prll_arr($element,result) \
	    [bgexec sm_prll_arr($element,status) \
		-output sm_prll_arr($element,output) \
		-error sm_prll_arr($element,error) \
		echo "$cmd_name $args\; echo @@\$?@@" | \
		rsh  $node_intercnct ksh -c &]

	} else {	;# on the local node. No rsh needed

	    # Note: in this local node case, bgexec acts differently
	    # in the case of an illegal command. It simply doesn't
	    # set any of the it's output variables at all. Catch this 
	    # and detect the missing result below
	     catch {set sm_prll_arr($element,result) \
		[bgexec sm_prll_arr($element,status) \
		    -output sm_prll_arr($element,output)\
		    -error sm_prll_arr($element,error) \
		    echo "$cmd_name $args" | ksh -c &] } msg

	}
    }

    # The reason for the following piece of code is because in 
    # some cases the status  might never be updated.. 
    # In such cases the vwait will have to forcibly time out vwait
    # The way to do this is to update the status variable with 
    # some value. Here after_script is being called

    # If the command is long-lived or hangs, we have a built in 
    # timer to cancel it. This is user customizable. When time_out is 
    # "none" we wait forever. Otherwise an after script kicks in to 
    # nuke the stuck bgexec's. 
    if {![cequal $time_out "none"]} {
	set after_id [after $time_out sm_after_script]
    }
   
    # If here, all the commands have been bgexec'd and we
    # we need to wait for them to complete. bgexed updates its 
    # status when the command completes so we vwait on that

    foreach element $real_list {			;# For each node

	# If the interconnect is set and ((node not in cluster)||
	# (node down) then continue... we don't have to do
	# any more checks for this node. rc is set from the
	# previous loop. We can only check for the
	# status of cluster members.. If not cluster members
	# then we cannot check if they are alive or dead.
	# If the non cluster node is dead, this proc 
	# will timoeout after the time_out interval is reached 

	if {[cequal $cl_inct "true"]} {
	    if { [catch { set rc [cluster member status \
		$element]} msg] != 0} {
		# Node name not in cluster
		continue
	    }
	     # hostname is in cluster but is down
	    # So just continue in this scenario also
	    if { $rc == 0 } {
		continue
	    }
	}
	

	if {[lempty $sm_prll_arr($element,status)]} {
		
	    # If this is a local node and command did not exist, 
	    # the the bgexec return value, sm_prll_arr($element,result),
	    # which usually holds the pid of the forked process, would
	    # be empty. 
	    
	    
	    if [info exists sm_prll_arr($element,result)] {  ;# If got a bgexec pid
		# Wait patiently for the command to complete. IOW, vwait
		# blocks waiting for the variable to change
		vwait sm_prll_arr($element,status)
		

		# If we reach here, the vwait for this node has completed
		# Now process check the return codes

		# We had embedded the exit code in between two @@.
		# Now we have to extract the real exit code of the
		# command executed remotely

		if {![cequal $element $local_host]} {	;# If remote node

		    # This is the hard case. We appended:
		    #	echo @@$?@@ to the rsh'd command so 
		    # we'd get the commands exit status (which 
		    # rsh doesn't give us). We now have to parse
		    # this out of the output stream. The hard work
		    # is done in extract_exitcode
		    # Note that bgexec returns a list as its status. It's
		    # in the form:
		    #	EXITED <pid > <exit status> "child completed normally"
		    # where pid of the forked bgexec process. 
		    # We return just the process's exit status to the caller
		    set temp_out [lindex [array get sm_prll_arr \
		         $element,output] 1]
		    set real_status [lindex [extract_exitcode \
				$temp_out] 0]
		    keylset retval $element-status $real_status
		    keylset retval $element-output \
			[lindex [extract_exitcode $temp_out] 1]
		
		} else {	;# local node case
		
		    # This is the easy case cause we have the return status
		    # There is not need to parse the output for it. 
        	    keylset retval $element-status \
			[lindex [lindex [array get \
			sm_prll_arr $element,status] 1] 2]
		    
		    keylset retval $element-output [lindex \
		     [array get sm_prll_arr $element,output] 1]

		}  ;# end if remote host

		# Fill in the stderr output of the command
		keylset retval $element-error [lindex \
		    [array get sm_prll_arr $element,error] 1]


	    } else {		;# bgexec failed - no pid
	

		# This is the case in which the command was invalid and hence
		# it cannot execute. This happens only for the local node.
		# This bgexec exits before executing the command as the shell
		# thinks this is an invalid command
		keylset retval $element-status 1
		keylset retval $element-output ""
		keylset retval $element-error "Error:\
                can't execute \"$cmd_name\""
	    } ;# end if got a bgexec pid

	} else {	;# status already filled in - bgexec completed

	    # Real fast execution cases --> No vwait in this case The case when
	    # the status has been updated and didn't have vwait. Output can be
	    # from local or remote node.  If the output is from remote node we
	    # need to extract the real output and real exitcode.

	    if {![cequal $element $local_host]} {	;# If remote node
		set temp_out [lindex [array get sm_prll_arr \
		         $element,output] 1]
		
		set real_status [lindex [extract_exitcode $temp_out] 0]
		keylset retval $element-status $real_status
		keylset retval $element-output [lindex \
				[extract_exitcode $temp_out] 1]
		 
	    } else {					;# local node

		keylset retval $element-status [lindex \
		    [lindex [array get sm_prll_arr $element,status] 1] 2]
		keylset retval $element-output [lindex \
		    [array get sm_prll_arr $element,output] 1]
	    }

	    # Same error processing for local and remote
            keylset retval $element-error [lindex \
		[array get sm_prll_arr $element,error] 1]

	} ;# end if bgexec status is ""

    } ;# end for each node
    

    # We have now vwaited for every node. If we are here then 
    # everything finished normally or we exceeded the timeout and 
    # the after proc has triggered. The after proc sets all the bgexec status
    # variables which tells bgexec to kill the process and forces
    # the vwait to complete. If we are still within the timeout period
    # then we need to cancel the vwait. Cancelling an already completed
    # after is OK. 
    if {![cequal $time_out "none"]} {
	after cancel $after_id
    }


    return $retval		;# Return the keyed list

} ;# end prll_rsh

proc extract_exitcode { output_list } {
# This proc will take the output_list, extract
# the exit code of the command executed on the remote
# note and return another list in a format
# {{return code} {real output}}. This is a helper
# proc for prll_rsh and is invoked from
# within prll_rsh

	set split_list [split $output_list \n]
        # Searching for the marker pattern. The return code is the
        # embedded within the marker pattern

        set indx [lsearch -regexp $split_list "^@@\[0-9\]+@@$"]
        set exitcode [lindex $split_list $indx]
        set temp_list ""

        # Removing the list element containing the marker and
        # then joining the string with list elements before
        # and after the marker element.

        set temp_list [join [lrange $split_list 0 \
		[expr $indx - 1]] \n]
        append exit_text ""
        append exit_text $temp_list " " [join [lrange \
			$split_list [expr $indx + 1] end] \n]

        if {$exitcode != ""} {
       
                # Extracting the return code from the 
	        # marke list element
                # using regexp. code contains the real exit code

            if {[regexp {@@([0-9]+)@@} $exitcode tmp code]} {
                set exitcode $code
                set return_text [list $exitcode $exit_text]
             } else {
                # Wrong format of return code or no 
		# return code at all
                # Actually this piece of code should never 
		# be executed
                set return_text [list " " $exit_text]
            }
        } else {
             set return_text [list  $exitcode $output_list]
        }
     return $return_text
}


proc sm_after_script {} {

# This is helper script for proc prll_rsh
# This is the script which will be invoked
# once the timeout interval is over for
# the prll_rsh proc. It uses two global
# arrays sm_arry and sm_prll_node_list
# which are both set in main proc prll_rsh
# sm_prll_node_list basically is the 
# node_list argument passed in the 
# prll_rsh

global sm_prll_arr
 
# Using both the above global arrays which 
# were set in proc prll_rsh

foreach element [array names sm_prll_arr *,status] { 
    
       if {[lempty $sm_prll_arr($element,status)]} {
	   set sm_prll_arr($element,status) HUNG
       }
   }

}


# Send the specified signals to the specified processes on the specified
#   cluster node.
# procArrayRef - the name of an array variable. The indices are service
#                names or IDs.  (Service names are listed here:
# www.unx.dec.com/visible/access/dispatch.cgi/sysman_forum/docProfile/100811/d20000314185046/t100811.htm)
#                Array values are the signal name or number to be
#                sent, including the "-".
# nodeName - name of the cluster node running the specified processes.
#            Defaults to the current node.

proc sm_signalProcs {procArrayRef {nodeName ""}} {
    upvar $procArrayRef  procArray

    foreach process [array names procArray] {

	# if process specified by name, find ID(s)
	if { ! [ctype digit $process]} {
	    catch {unset tmpArr}
	    set tmpArr($process) ""
	    sm_getProcIds tmpArr $nodeName
	    set tmp $tmpArr($process)
	    set pids ""
	    foreach pair $tmpArr($process) {
		lappend pids [lindex $pair 0]
	    }
	    
	} else {
	    set pids $process
	}

	if {[cequal $nodeName ""]} {
	    kill $procArray($process) $pids
	} else {
	    sm_rsh $nodeName "kill $procArray($process) $pids"
	}
    }

}

#
# Procedure: 
#     prll_rsh_specific - parallel rsh with a specific command for each
#     node. This routine is a variant of prll_rsh. It is useful for
#     when you want to send the same command to each node, but the 
#     command is slightly specific in some way, either has a PID in it,
#     or, has a '-focus nodename' on it, etc.  This procedure has the
#     same problems, benefits and limitations as prll_rsh.  Only the
#     mechanism to get node name and command has been modified so that
#     we could do things specific to each node instead of the same
#     command.
# 
# Inputs: 
#       command_array - array where the names of the elements of the array
#		    are the hosts to execute commands on. The value of 
#		    each array entry is the specific command to execute.
#		    e.g. set command_array($node) "kill -9 $pid"
#	time_out  - time to wait (in milliseconds) before declaring the 
#		    rsh "hung" and returning an error status for that host. 
#	cl_int	  - if true, all nodes in node_list are cluster members and
#		    the cluster interconnect will be used for all rsh's. 
#		    
# Outputs: 
#       None
# Returns:
#	return value is a keyedlist with 3 keys of the form node-status, 
#	node-output, node-error for each node in the node_list. 
#	node-status will contain the exit status of the command
#	node-output will contain the stdout of the command (if any)
#	node-error will contain the stderr of the command (if any)
#       
# Notes:
#
#	Since rsh is used to execute <cmd_name> on remote hosts, 
#	there are several annoying restrictions:
#	    1) Rsh doesn't separate a commands stdout and stderr. It
#	       opens, two sockets, one for each, and displays the output
#	       as it's available. This causes stdout and stderror output
#	       to be combined/mixed together. IOW, the output from
#	       rsh cmd and the same command executed locally can be different.
#	       We could compensate for this by redirecting all the output
#	       to stderror or stdout at the expense of distinguishing 
#	       between the two. We don't do that currently.
#	    2) rsh does not return the exit status of the command it
#	       executes. Instead, it returns it's own status - whether
#	       it was able to successfully connect to rshd on the remote
#	       system and execute commands. In order to get back the real,
#	       remote status, we append echo $? to the command. That
#	       puts the exit status in the command output and we parse
#	       it out. Because of 1) above, it's a bit trickier than
#	       just reading the last line of output. See the code. 
#
#	A global array sm_prll_arr is used in this proc. 
#	This array is indexed by node,status, node,output
#	and node,error as the variable names. The
#	array has variable, value pairs.
#	Output, status and error are got from the 
#	bgexec. The output, status and error from 
#	bgexec are further parsed to get the correct 
#	output, error and status for each command 
#	executed on each node. 
#
#	This array holds one other variable node,result 
#	which is used to hold the exit status of bgexec 
#	command 
#
# Restrictions:
#	Currently, there is an artifical restriction that all the nodes
#	must be in the same domain. This is because unqualified node names
#	are used in resturn stati, keyed list indexes, etc. 
#	[pjw: easy bug to fix however] 
#
#
proc prll_rsh_specific { command_arrayref time_out cl_inct } {

    upvar $command_arrayref command_array

    set local_host [lindex [split [id host] .] 0]
   
    # Global array which holds the values of 
    #  status, output and error for
    # a particular node for a particular command
    # Syntax of the index would be arr($hostname,status),
    # arr($hostname,output)
    # and arr($hostname,error)
    
    global sm_prll_arr
   
    # Unsetting global array before starting
    if {[info exists sm_prll_arr]} {
	unset sm_prll_arr
    }
   
#    set real_list ""
#    foreach node $node_list {
#	lappend real_list [lindex [split $node .] 0]
#    }

    # clear out retval in case there is a problem with command_array
    set retval {}
   
    for_array_keys element command_array { 
	# Check for cluster membership only when 
	# user sets the cl_inct to true

	if {[cequal $cl_inct "true"]} {
	    if { [catch { set rc [cluster member status \
		$element]} msg] != 0} {
	 
	    # hostname not in cluster so just let the user 
	    #know this fact
	    # no error messages..
		keylset retval $element-status 1
		keylset retval $element-output ""
		keylset retval $element-error "Node $element \
                              not in cluster"
	    
	    # Don't want any further processing for this 
	    # node
		continue
	    }

	    # hostname is in cluster but is down
	    # So let the user know of this fact
	    if { $rc == 0 } {
		keylset retval $element-status 1
		keylset retval $element-output ""
		keylset retval $element-error "Node $element is down"
		continue
	    }
	    # Cluster membership check ends 
	}

	# Initializing the global array variables
	 set sm_prll_arr($element,status) ""
	 set sm_prll_arr($element,output) ""
	 set sm_prll_arr($element,error) ""
	
	# Parsing the hostname and extracting the unqualified name
	# !! This wouldn't work if the list nodes are in different
	# domains
	# !! for now all the nodes in the cluster belong 
	# to a single domain

	# If the cl_inc is true then use cluster interconnect
	# else use the name of the node for rsh 
	if {[cequal $cl_inct "true"]} {
	    set node_intercnct [cluster member interconnect \
			$element]
	} else {
	    set node_intercnct $element
	}

	# Checking to see if running on a local node 
	# and executing locally if
	
	if {[info exists sm_prll_arr($element,result)]} {
	    unset sm_prll_arr($element,result)
	}
	
	# Note:- If the command is invalid command then there are 
	# 2 sceanrios. On the local node bgexec will complain and 
	# exit out without doing anything. In this case we can 
	# check the exit code of bgexec and do info exists, if not
	# then the command didn't execute and hence no point 
	# waiting for the status
	# since this is never going to happen.

	# In the remote node case, even though the command 
	# is invalid, 
	# since rsh is valid
	# we are going to get the status, output and error, 
	# so in this case we can safely wait
	# on the status variable. Also I am embedding the 
	# return value between two @@. Rsh has two independent
	# sockets for collecting
	# stderr and stdout. And rsh reports the stderr/stderr
	# in a single return value. Since stderr and stdout 
	# are on different sockets
	# the order in which rsh collects the data will
	# be different.
	
	# get the command for this node
	set cmd_name [set command_array($element)]

	if {![cequal $element $local_host]} {	;# If not the local host

	

	    # NOTE:- The weird "echo "command" | rsh ksh -c syntax 
	    # is intentional. Since cmd_name can contain
	    # completely arbitrary text, such as quotes, curlies, 
	    # semicolons, etc., it is critical that we prevent tcl 
	    # from eval'ing these arguments or otherwise trying 
	    # to parse them. The more natual 
	    #      rsh ksh -c "$cmd_name\;echo |$?"
	    # flat out does not work. No combination of curlies, 
	    # or quoting can make it work. 

	    set sm_prll_arr($element,result) \
	    [bgexec sm_prll_arr($element,status) \
		-output sm_prll_arr($element,output) \
		-error sm_prll_arr($element,error) \
		echo "$cmd_name\; echo @@\$?@@" | \
		rsh  $node_intercnct ksh -c &]

	} else {	;# on the local node. No rsh needed

	    # Note: in this local node case, bgexec acts differently
	    # in the case of an illegal command. It simply doesn't
	    # set any of the it's output variables at all. Catch this 
	    # and detect the missing result below
	     catch {set sm_prll_arr($element,result) \
		[bgexec sm_prll_arr($element,status) \
		    -output sm_prll_arr($element,output)\
		    -error sm_prll_arr($element,error) \
		    echo "$cmd_name" | ksh -c &] } msg

	}
    }

    # The reason for the following piece of code is because in 
    # some cases the status  might never be updated.. 
    # In such cases the vwait will have to forcibly time out vwait
    # The way to do this is to update the status variable with 
    # some value. Here after_script is being called

    # If the command is long-lived or hangs, we have a built in 
    # timer to cancel it. This is user customizable. When time_out is 
    # "none" we wait forever. Otherwise an after script kicks in to 
    # nuke the stuck bgexec's. 
    if {![cequal $time_out "none"]} {
	set after_id [after $time_out sm_after_script]
    }
   
    # If here, all the commands have been bgexec'd and we
    # we need to wait for them to complete. bgexed updates its 
    # status when the command completes so we vwait on that

    for_array_keys element command_array {    ;# For each node

	# If the interconnect is set and ((node not in cluster)||
	# (node down) then continue... we don't have to do
	# any more checks for this node. rc is set from the
	# previous loop. We can only check for the
	# status of cluster members.. If not cluster members
	# then we cannot check if they are alive or dead.
	# If the non cluster node is dead, this proc 
	# will timoeout after the time_out interval is reached 

	if {[cequal $cl_inct "true"]} {
	    if { [catch { set rc [cluster member status \
		$element]} msg] != 0} {
		# Node name not in cluster
		continue
	    }
	     # hostname is in cluster but is down
	    # So just continue in this scenario also
	    if { $rc == 0 } {
		continue
	    }
	}
	

	if {[lempty $sm_prll_arr($element,status)]} {
		
	    # If this is a local node and command did not exist, 
	    # the the bgexec return value, sm_prll_arr($element,result),
	    # which usually holds the pid of the forked process, would
	    # be empty. 
	    
	    
	    if [info exists sm_prll_arr($element,result)] {  ;# If got a bgexec pid
		# Wait patiently for the command to complete. IOW, vwait
		# blocks waiting for the variable to change
		vwait sm_prll_arr($element,status)
		

		# If we reach here, the vwait for this node has completed
		# Now process check the return codes

		# We had embedded the exit code in between two @@.
		# Now we have to extract the real exit code of the
		# command executed remotely

		if {![cequal $element $local_host]} {	;# If remote node

		    # This is the hard case. We appended:
		    #	echo @@$?@@ to the rsh'd command so 
		    # we'd get the commands exit status (which 
		    # rsh doesn't give us). We now have to parse
		    # this out of the output stream. The hard work
		    # is done in extract_exitcode
		    # Note that bgexec returns a list as its status. It's
		    # in the form:
		    #	EXITED <pid > <exit status> "child completed normally"
		    # where pid of the forked bgexec process. 
		    # We return just the process's exit status to the caller
		    set temp_out [lindex [array get sm_prll_arr \
		         $element,output] 1]
		    set real_status [lindex [extract_exitcode \
				$temp_out] 0]
		    keylset retval $element-status $real_status
		    keylset retval $element-output \
			[lindex [extract_exitcode $temp_out] 1]
		
		} else {	;# local node case
		
		    # This is the easy case cause we have the return status
		    # There is not need to parse the output for it. 
        	    keylset retval $element-status \
			[lindex [lindex [array get \
			sm_prll_arr $element,status] 1] 2]
		    
		    keylset retval $element-output [lindex \
		     [array get sm_prll_arr $element,output] 1]

		}  ;# end if remote host

		# Fill in the stderr output of the command
		keylset retval $element-error [lindex \
		    [array get sm_prll_arr $element,error] 1]


	    } else {		;# bgexec failed - no pid
	

		# This is the case in which the command was invalid and hence
		# it cannot execute. This happens only for the local node.
		# This bgexec exits before executing the command as the shell
		# thinks this is an invalid command
		keylset retval $element-status 1
		keylset retval $element-output ""
		keylset retval $element-error "Error:\
                can't execute \"$cmd_name\""
	    } ;# end if got a bgexec pid

	} else {	;# status already filled in - bgexec completed

	    # Real fast execution cases --> No vwait in this case The case when
	    # the status has been updated and didn't have vwait. Output can be
	    # from local or remote node.  If the output is from remote node we
	    # need to extract the real output and real exitcode.

	    if {![cequal $element $local_host]} {	;# If remote node
		set temp_out [lindex [array get sm_prll_arr \
		         $element,output] 1]
		
		set real_status [lindex [extract_exitcode $temp_out] 0]
		keylset retval $element-status $real_status
		keylset retval $element-output [lindex \
				[extract_exitcode $temp_out] 1]
		 
	    } else {					;# local node

		keylset retval $element-status [lindex \
		    [lindex [array get sm_prll_arr $element,status] 1] 2]
		keylset retval $element-output [lindex \
		    [array get sm_prll_arr $element,output] 1]
	    }

	    # Same error processing for local and remote
            keylset retval $element-error [lindex \
		[array get sm_prll_arr $element,error] 1]

	} ;# end if bgexec status is ""

    } ;# end for each node
    

    # We have now vwaited for every node. If we are here then 
    # everything finished normally or we exceeded the timeout and 
    # the after proc has triggered. The after proc sets all the bgexec status
    # variables which tells bgexec to kill the process and forces
    # the vwait to complete. If we are still within the timeout period
    # then we need to cancel the vwait. Cancelling an already completed
    # after is OK. 
    if {![cequal $time_out "none"]} {
	after cancel $after_id
    }


    return $retval		;# Return the keyed list

} ;# end prll_rsh_specific


############################################################################
#
# Class:
#	SM_app_mutex
#
# Abstract:
#	Class definitions for implementing simple application mutual
#	exclusion. In other words, if you want to prevent two users from
#	attempting to do the same management operation at the same time.  These
#	metods are conceptually similar to today's "locking" of /etc/passwd. In
#	order to prevent multiple user's from simultaneously operating on
#	/etc/passwd, a crude file semaphore is used. SM_app_mutex
#	uses flock on a lock file to prevent simultaneous access.
#
# Notes:
#
# Class Description:
#	The methods for this class are:
#	   - init    - initializes and creates a lock object
#	   - lock    - try and get the lock
#	   - unlock  - release the lock
#	   - get_pid - for long lived locks, get the pid of the lock owner
#		       this is currently NYI
#
############################################################################


Class SM_app_mutex


##########################################################################
#
# Method: 
#	SM_app_mutex init - this instproc initializes a lock object
#	We save the lock filename and init the i18n catalog
# 
# Inputs: 
#	lock_file - name of the lock file to create. By convention, 
#		    it should be /var/run/<name>.pid where name
#		    is indicative of your application (e.g. /var/run/inetd.pid)
#
# Outputs: 
#	None
# 
# Returns:
#	None
#
# Notes: 
# 
# 
##########################################################################

SM_app_mutex instproc init { lock_file } {

    $self set lock_file $lock_file

    SM_debug sm_appmutex_dbg SM_APP_MUTEX_DEBUG	;# Create debugging obj

} ;# end SM_app_mutex init


##########################################################################
#
# Method: 
#	SM_app_mutex lock - this instproc attempts to create 
#	a /var/run/ file with our pid and take out an exclusive lock 
#       on it. If we can get the exlusive lock (via fcntl) then 
#	we know that there is no active process attempting
#	to compete with us. Note that if a processes crashes or is 
#	terminated, then any locks are released. This is nice cause
#	we then don't have the standard /var/run issue of wondering
#	whether the pid is valid. In fact, we don't even necessarily 
#	care about the pid. The caller could just tell the user that 
#	there is another user competing with him. 
# 
# Inputs: 
#	None
# Outputs: 
#	None
# 
# Returns:
#	The return value is a two element status list. The first element
#	is the status code:
#	    0 for success
#	    1 for failure
#	The second element is an error message. This will be empty
#	if there is no error text associated with the error
#
# Notes: 
# 
##########################################################################

SM_app_mutex instproc lock { } {

    $self instvar lock_file

    # Do an open on the lock file. Note:
    #  - you have to open it in order to perform fcntl locking. IOW, 
    #    you need the fd
    #  - you HAVE to use the WRONLY syntax and now the "w" syntax below
    #    or else the lock file get's truncated! tcl underneath is doing
    #	 an fopen and the w is getting mapped to be an open with truncate.
    #	 the flags in capitals are the real fopen flags and just get
    #	 used verbatim. 


    if { [catch { open $lock_file {WRONLY CREAT} 0600} fd] } {
	# pjw: Note: assuming the lock file is in a directory like /var/run/,
	# one easy way to test this code path is to 
	# run without privs. The error will be permission denied. 


	# This message uses the /var/run filename and exact
	# error we got on the open
	set msg [sm_catgets I18N_catobj \
			i18n_app_mutex_error_cannot_create_lock_file_txt]
	set msg [format $msg $lock_file $fd]

	sm_appmutex_dbg puts "SM_app_mutex: couldn't open $lock_file, $msg"

	return [list 1 "$msg"]

    } else {			;# File creation worked!!!

	# So now we are in a race to lock it

	if { [flock -write -nowait $fd] == 1 } {;# If we got the lock!


	    # Save the fd for the unlock operation where we close the file
	    # Also, if our lock object has this instvar set then we 
	    # we know have the lock. We'll use this in the unlock to 
	    # prevent a rogue caller from trying to unlock when he
	    # never got the lock in the first place
	    $self set lock_file_fd $fd

	    puts $fd [pid]			;# Write our pid to the file

	    # Make sure to catch flush error (e.g. disk space full. 
	    # The open can succedd but the flush will fail)
	    if { [catch {flush $fd} result] } {	      ;# If flush fails

		# This message uses the /var/run filename and exact
		# error we got on the open
		set msg [sm_catgets I18N_catobj \
			i18n_app_mutex_error_cannot_create_lock_file_txt]
		set msg [format $msg $lock_file $result]

	        sm_appmutex_dbg puts "SM_app_mutex: can't get lock: $msg"

		return [list 1 "$msg"]
	    } 


	} else {				;# Else cannot lock
	    sm_appmutex_dbg puts "SM_app_mutex: can't get lock"
	    return [list 1 ""]			;# Return error
	}

    } ;# end if we can open the lock file

    return [list 0 ""]				;# Success!

}	;# end SM_app_mutex lock

##########################################################################
#
# Procedure: 
#	SM_app_mutex get_pid - if the lock file exists, read
#	the pid out of it. 
#	Note that this only make sense for long lived applications. 
#	If you are using the app mutex in a flush then typically
#	that's short lived and if another competing app saw the lock
#	was there and went to get the pid, the file would be gone
#	by the time they tried to read it. If the app is long lived, 
#	then there is less of an issue. If there is a race and
#	we can't read the pid from the file (either cause it's 
#	not written yet or because the file was deleted, then 
#	we return "unknown" for the PID. 
# 
# Inputs: 
#	None
#
# Outputs: 
#	None
# 
# Returns:
#	pid - the pid in the lock file or "unknown" if 
#	      we opened the file but couldn't find a pid
#
# Notes: 
#	- It's assumed this proc is called only when the lock file exits. 
#	  IOW, the intialization of the SM_app_mutex object succeeded
#	- We know the i18n catalog is initialized cause it's done
#	  but our init instproc
# 
# 
##########################################################################

SM_app_mutex instproc get_pid { } {

    $self instvar lock_file


    # Get pid from file so we can put it in the message

    if { [catch {open $lock_file RDONLY} fd ] } {	;# If open fails

	sm_appmutex_dbg puts \
		"SM_app_mutex: $lock_file exists but can't open it"

	# Hmmm... the file exists but we can't open it. I'm not
	# sure this can ever happen but just in case, 
	# we'll display the message about the other process
	# using a pid of "unknown
	set pid [sm_catgets I18N_catobj i18n_app_mutex_warning_pid_unknown_txt]

    } else {					;# Open worked. 

	sm_appmutex_dbg puts \
		"SM_app_mutex: $lock_file open worked. Getting pid..."

	# But if the gets fails or the pid is blank
	if {[gets $fd pid] == -1 || $pid == ""} {	

	    # Then fall back to using "unknown"		
	    set pid \
		[sm_catgets I18N_catobj i18n_app_mutex_warning_pid_unknown_txt]

	} else {
		# else the pid is set properly by the gets above
	}

	sm_appmutex_dbg puts "SM_app_mutex:pid is '$pid'"

	catch {close $lock_file} 

    } ;# end if open fails


    return $pid


}	;# end SM_app_mutex get_pid

##########################################################################
#
# Procedure: 
#	SM_app_mutex unlock - remove our lock file when we're done
#	Note that deleting this file should relinquish our fcntl exclusive
#	write lock on it. 
# 
# Inputs: 
#	None
#
# Outputs: 
#	None
# 
# Returns:
#	{} for success
#	error message string for failure
#
# Notes: 
# 
# 
##########################################################################

SM_app_mutex instproc unlock { } {


    # If the caller doesn't have the lock fd then just quitly ignore him
    if { [$self info vars lock_file_fd ] == "" } {
	sm_appmutex_dbg puts "SM_app_mutex unlock: you never got the lock!!"
	return {}
    }


    # Make SURE to delete it before the close to avoid race conditions
    # with locking! The delete should just unlink it so we 
    # still have a lock. 
    file delete [$self set lock_file]
    catch {close [$self set lock_file_fd]}

}	;# end SM_app_mutex unlock

#
# Procedure:
#	sm_catgets - get a message from a catalog. This is usually 
#	quite a simple thing to do except if you are generic utility
#	code (like this code) and the caller's catalog could be a 
#	SUIT catalog or a generic catalog. In that case, the methods 
#	are different. This routine does the right thing by detecting 
#	the catalog object type and using the proper methods. 
#
# Inputs:
#	cat      - SUIT or mcl message catalog object
#	msg_id   - message identifier (e.g. foo_lbl or foo_msg)
#	args     - argument to populate the msg with.
# Outputs:
# 	None
# Returns:
#	message fetched from the catalog
# Notes:
#	The callers catalog object must be initialized before using this proc
#	The I18N catalog is special cased though so we do init it if needed
#
#	[pjw: we need to move the I18N_catobj initialization to sysmansh
#	 so it's not done in a bazillion places all over SUIT/MCL/here and 
#	 the user's app.
#	 Also, the SUIT catalog obj should be a subclass of the 
#	 standard catalog object so it shares the same methods. That
#	 would make this nonsense unneccessary]
#
proc sm_catgets { cat msg_id args } {

    # Special case the I18N catalog. It it's not init'd then do that now
    if { $cat == "I18N_catobj" && [info command I18N_catobj] == "" } {
	_Catalog I18N_catobj
	I18N_catobj catopen i18n_motif_shared_text
    }

    # Make sure we're passed a valid catalog
    if { [catch {$cat info class}] } {
	return \
	  "sysman_utils: Error - message catalog $cat has not been initialized"
    } 


    switch [$cat info class] {
	"_Catalog" {
	    # If the message is from the i18n catalog, then 
	    # do the lookup there
	    set i18n [lindex [split $msg_id "_"] 0]	;# Get the msg prefix

	    if { $i18n == "i18n" } {			;# If it's an i18n msg
							;# Get from i18n cat
		return [eval I18N_catobj catgets $msg_id $args]	
	    } else {					;# Else caller's cat
		return [eval $cat catgets $msg_id $args]
	    } 
	} 

	"_UIT_Catalog" {
	    # If it's a SUIT catalog, then use the catget method
	    # (note the singlar). This method expects a message prefix
	    # and a suffix. The final message fetched is "prefix_suffix"
	    # We transform the caller's msg_id into this format
	    # Note that SUIT will first look in the shared i18n catalog
	    # and then the SUITlet's catalog. 

	    set tmp [split $msg_id "_"]		;# Split in sections
	    set suffix [lindex $tmp end]	;# Get the msg suffix
	    # Ouch. Form the prefix by rejoining all but the suffix
	    # Lists are 0 based but llength isn't so subtract 2 to ignore
	    # the last list element
	    set prefix [join [lrange $tmp 0 [expr [llength $tmp] - 2]] "_" ]

	    return [eval $cat catget $prefix $suffix $args]
	} 
	
	default {
	    return \
	       "sysman_utils: Error - $cat is not a known catalog object type."
	} 
    } 

} ;# end sm_catgets

proc sm_isOSVersion { version  {node {}} } {
##
##  arg1 = version string to look for, such as 5.0a, 5.1, etc.
##  arg2 = (optional) defaults to blank -- this is the node
##         name on which we need to know what OS version is running.
##         If node name is blank but we are in a cluster and focused
##	   then that is the node that is used.
##
##  Returns:
##  1 = if version provided is found in the OS version string
##  0 = if version provided is NOT found in the OS version string
##
## Accepts the version string and optional node name, and tries to 
## find the version passed in by the user in the OS_version in the 
## cluster info.  This is useful in a rolling upgrade, to determine
## if the node you are focusing on is running an earlier OS version.
##

global SysmanFocusHost SysmanOnCluster

	if { !$SysmanOnCluster } {
  		return 0
	}

	set host [id host]
	set host [lindex [split $host .] 0]

	# if a node name is provided...ensure that it 
	# is not fully qualified.
	if {[llength $node] != 0} {
        	set node [lindex [split $node .] 0]
	}

	# Get a version of SysmanFocusHost that is not 
	# fully qualified so that we can
	# compare it to $host. Don't want to change 
	# global SysmanFocusHost itself.

	set FocusHost $SysmanFocusHost          ;# Incidentally, could be null

	# if $SysmanFocusHost is not null, extract the unqualified part.
	if {[llength $SysmanFocusHost] != 0} {
    		set FocusHost [lindex [split $SysmanFocusHost .] 0]
	}

	# get node information, such as os_version
	if {![cequal $node ""]} {
		set info [cluster member info $node]
	} elseif {[llength $FocusHost] != 0 } { 
		set info [cluster member info $FocusHost]
	} else {
		set info [cluster member info]
	}

	# now that we have node information, lets find
	# out if the node is running the OS version
	# that the user is asking about....   
	set listinfo [split $info]

	# get the base_version string value and length
	# the base_version in the info string is followed
        # by the clu_version.  So, by finding where the
	# base_version starts and where it ends, we can
	# search just the base_version to look for the
	# version information that we want.
	set low [lsearch $listinfo base_version]
	set high [lsearch $listinfo clu_version]
	set version_string [lrange $listinfo \
		[expr {$low + 1}] [expr {$high - 1}]]
	set limit [llength $version_string]

	# now check to see if the version the user
	# is looking for is in the version_string
	for {set i 0} { $i < $limit} {incr i} {
		set value [lindex $version_string $i]
		if {[regexp $version $value]} {
			return 1
		} 
	} 

  	return 0
}

proc sm_JobARG { jobname {-uroot} {node {}} } {
##
##  arg1 = name of the process (job) 
##  arg2 = (optional) defaults to blank -- this is the node
##         name on which we need to perform the ps command. 
##
## Gets the job's ARGs from the ps output.  Returns the command on success or
## -1 on failure.  
##

global SysmanFocusHost

set host [id host]
set host [lindex [split $host .] 0]

# if a node name is provided...ensure that it is not fully qualified.
if {[llength $node] != 0} {
        set node [lindex [split $node .] 0]
}

# Get a version of SysmanFocusHost that is not fully qualified so that we can
# compare it to $host. Don't want to change global SysmanFocusHost itself.

set FocusHost $SysmanFocusHost          ;# Incidentally, could be null

# if $SysmanFocusHost is not null, extract the unqualified part.
if {[llength $SysmanFocusHost] != 0} {
    set FocusHost [lindex [split $SysmanFocusHost .] 0]
}

# if we need to look at a particular node, either focused or a cluster member, 
# we need to check to see if we are calling the local host.  If not, then
# use rsh, otherwise, just look locally.
if {[llength $node] != 0  &&  [cequal $node $host] != 1} {
    if { ![cluster member status $node] } {
	return -1
    }
    set node_intercnct [cluster member interconnect $node]
    set fd [open "|rsh -n $node_intercnct /sbin/ps -e -o pid,command" r ]
} elseif {[llength $FocusHost] != 0 && 
          [cequal $FocusHost $host] != 1} {
    if { ![cluster member status $FocusHost] } {
	return -1
    }
    set focus_intercnct [cluster member interconnect $SysmanFocusHost]
    set fd [open "|rsh -n $focus_intercnct /sbin/ps -e -o pid,command" r ]
} else {
    set fd [ open "|/sbin/ps -e -o pid,command" r ]
}

  set retval -1
  while { [gets $fd line] >= 0 } {
    regsub -all "\[ \t\]+" $line " " line
    regsub -all "^\ " $line "" line
    set lline [split  $line]

	 # Need to check the second and third items in the list to see if
	 # either one of them matches the jobname.  This is necessary 
	 # because if a process is invoked by a shell script then the name
	 # of the shell will appear in the second list item and the name of 
	 # the process will appear in the third item.  If the process is not
	 # invoked via a shell, then the process name will be the second item
	 # in the list.
	 # 
	 set list {1 2} ;# Note the first item in the list starts at index 0
	 foreach i $list { 
    	if { [lindex $lline $i ] == $jobname } {
      	set retval [lrange $lline 1 end]	;# return jobs PID 
    	} elseif { [file tail [lindex $lline $i ]] == $jobname } {
      	set retval [lrange $lline 1 end]
    	}
	 } 
  }
  close $fd
  return $retval
}


# Note: The routines  sm_GetIPAddr and sm_GetHostName are 
# temporary routines and these routines will exist
# till we have Todd's Advanced Mcl's. 

## This proc returns the IP Address given the Hostname
## Takes in one argument hostname/alias
## Example sm_GetHostName crusher.unx.dec.com
## Example sm_GetHostName crusher
## If the hostname is not found in the local /etc/hosts file
## the proc returns ""

proc sm_GetIPAddr {HostName} {
        set firstName [lindex [split $HostName .] 0]
        set fid [open /etc/hosts r]
        while {[gets $fid line] >=0} {
	    # Ignore comments
	   if {[lsearch -regexp $line "^\#"]<0} {
	       if {[set idx [lsearch -exact $line $HostName]] >= 0 \
		       || [set idx [lsearch -exact $line $firstName]] >= 0} {
		   set linesplit [split $line]
		   return [lindex $linesplit 0]
	       }
	   }
        }
	return ""
} 

## This proc returns the HostName given the IPAddress 
## Takes in one argument
## Example sm_GetHostName crusher.unx.dec.com
## If the IPAddress is not found in the local /etc/hosts file
## this proc returns "".

proc sm_GetHostName {IPAddr} {
     set fid [open /etc/hosts r]
     while {[gets $fid line] >=0} {
	 # Ignore comments
	 if {[lsearch -regexp $line "^\#"]<0} {
	     if {[set idx [lsearch -exact $line $IPAddr]] >= 0} {
                set linesplit [split $line]
                set inx 1
		 # There has to be better logic than this.
		 # This magic number 80 means that a single line in 
		 # /etc/hosts should have less then 80 aliases. 
		 # If more than 80, assuming that the file is corrupted.
		 
		 while {[cequal [lindex $linesplit $inx] ""] &&  $inx < 80} {
                        incr inx
                }
                if {$inx < 80} {
                        return [lindex $linesplit $inx]
                }
	     }
	 }
     }
     return ""
}


##---------------------------------------------------------------------------
 #
 # Proc:  sm_CalledbySuit
 #
 #      Used by mcl code to determine whether the mcl code
 #      was invoked by SUIT or by a CLI call.
 #
 # Inputs:
 #      None
 #
 # Outputs:
 #      None
 #
 # Returns:
 #      0 - not called by SUIT
 #      1 - was called by SUIT
 #
 # Notes:
 #      This code is useful for mcls which need to display
 #      different messages based on where called, or mcls
 #      which need not capture output since called on
 #      command line.
 #
##---------------------------------------------------------------------------

proc sm_CalledBySuit {} {

        if { [info globals _UIT_g_main] != "" } {
                return 1
        } else {
                return 0
        }

}                               ; ## end routine sm_CalledBySuit

