Data Mining Group - General Regression

PMML 3.2 - General Regression

Model XSD and Tag Description


  <!-- GeneralRegressionModel: top-level element of a general regression model.
       The modelType attribute selects which regression variant is described. -->
  <xs:element name="GeneralRegressionModel">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
        <xs:element ref="MiningSchema"/>
        <xs:element ref="Output" minOccurs="0"/>
        <xs:element ref="ModelStats" minOccurs="0"/>
        <xs:element ref="Targets" minOccurs="0"/>
        <xs:element ref="LocalTransformations" minOccurs="0"/>
        <!-- Parameters, followed by the factor and covariate predictors. -->
        <xs:element ref="ParameterList"/>
        <xs:element ref="FactorList" minOccurs="0"/>
        <xs:element ref="CovariateList" minOccurs="0"/>
        <!-- Predictor-to-Parameter matrix, optional covariance matrix,
             and the matrix of Parameter estimates. -->
        <xs:element ref="PPMatrix"/>
        <xs:element ref="PCovMatrix" minOccurs="0"/>
        <xs:element ref="ParamMatrix"/>
        <xs:element ref="ModelVerification" minOccurs="0"/>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      </xs:sequence>

      <!-- targetVariableName is optional, informational only, and deprecated.
           Use usageType="predicted" in a MiningField instead. -->
      <xs:attribute name="targetVariableName" type="FIELD-NAME"/>

      <!-- Required: the kind of regression model. The value generalizedLinear is added. -->
      <xs:attribute name="modelType" use="required">
        <xs:simpleType>
          <xs:restriction base="xs:string">
            <xs:enumeration value="regression"/>
            <xs:enumeration value="generalLinear"/>
            <xs:enumeration value="multinomialLogistic"/>
            <xs:enumeration value="ordinalMultinomial"/>
            <xs:enumeration value="generalizedLinear"/>
          </xs:restriction>
        </xs:simpleType>
      </xs:attribute>

      <xs:attribute name="modelName" type="xs:string"/>
      <xs:attribute name="functionName" use="required" type="MINING-FUNCTION"/>
      <xs:attribute name="algorithmName" type="xs:string"/>
      <!-- Cumulative link function for the ordinalMultinomial model type. -->
      <xs:attribute name="cumulativeLink" type="CUMULATIVE-LINK-FUNCTION"/>

      <!-- Added attributes: linkFunction, linkParameter, trialsVariable, trialsValue,
           distribution, distParameter, offsetVariable, offsetValue.
           The first six are only relevant for the generalizedLinear model type;
           offsetVariable and offsetValue can be used for generalizedLinear or
           ordinalMultinomial. -->
      <xs:attribute name="linkFunction" type="LINK-FUNCTION"/>
      <xs:attribute name="linkParameter" type="REAL-NUMBER"/>
      <xs:attribute name="trialsVariable" type="FIELD-NAME"/>
      <xs:attribute name="trialsValue" type="INT-NUMBER"/>
      <xs:attribute name="distribution">
        <xs:simpleType>
          <xs:restriction base="xs:string">
            <xs:enumeration value="binomial"/>
            <xs:enumeration value="gamma"/>
            <xs:enumeration value="igauss"/>
            <xs:enumeration value="negbin"/>
            <xs:enumeration value="normal"/>
            <xs:enumeration value="poisson"/>
          </xs:restriction>
        </xs:simpleType>
      </xs:attribute>
      <xs:attribute name="distParameter" type="REAL-NUMBER"/>
      <xs:attribute name="offsetVariable" type="FIELD-NAME"/>
      <xs:attribute name="offsetValue" type="REAL-NUMBER"/>
    </xs:complexType>
  </xs:element>

  <!-- ParameterList: lists all Parameters of the model (at least one). -->
  <xs:element name="ParameterList">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
        <xs:element ref="Parameter" maxOccurs="unbounded"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>

  <!-- Parameter: a single model parameter.
       name is required; label is an optional free-text description. -->
  <xs:element name="Parameter">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      </xs:sequence>
      <xs:attribute name="name" use="required" type="xs:string"/>
      <xs:attribute name="label" type="xs:string"/>
    </xs:complexType>
  </xs:element>

  <!-- FactorList: the factor Predictors of the model. The list may be empty. -->
  <xs:element name="FactorList">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
        <xs:element ref="Predictor" minOccurs="0" maxOccurs="unbounded"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>

  <!-- CovariateList: the covariate Predictors of the model. The list may be empty. -->
  <xs:element name="CovariateList">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
        <xs:element ref="Predictor" minOccurs="0" maxOccurs="unbounded"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>

  <!-- Predictor: one entry of a FactorList or CovariateList, identified by a field name. -->
  <xs:element name="Predictor">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      </xs:sequence>
      <xs:attribute name="name" use="required" type="FIELD-NAME"/>
    </xs:complexType>
  </xs:element>

  <!-- PPMatrix: Predictor-to-Parameter matrix, written as a sequence of PPCell
       elements (at least one). -->
  <xs:element name="PPMatrix">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
        <xs:element ref="PPCell" maxOccurs="unbounded"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>

  <!-- PPCell: one cell of the PPMatrix, addressed by predictorName and
       parameterName. value is a string; targetCategory is optional. -->
  <xs:element name="PPCell">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      </xs:sequence>
      <xs:attribute name="value" use="required" type="xs:string"/>
      <xs:attribute name="predictorName" use="required" type="FIELD-NAME"/>
      <xs:attribute name="parameterName" use="required" type="xs:string"/>
      <xs:attribute name="targetCategory" type="xs:string"/>
    </xs:complexType>
  </xs:element>

  <!-- PCovMatrix: matrix of Parameter estimate covariances, written as a
       sequence of PCovCell elements (at least one). The optional type
       attribute accepts the values model and robust. -->
  <xs:element name="PCovMatrix">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
        <xs:element ref="PCovCell" maxOccurs="unbounded"/>
      </xs:sequence>
      <xs:attribute name="type">
        <xs:simpleType>
          <xs:restriction base="xs:string">
            <xs:enumeration value="model"/>
            <xs:enumeration value="robust"/>
          </xs:restriction>
        </xs:simpleType>
      </xs:attribute>
    </xs:complexType>
  </xs:element>

  <!-- PCovCell: one cell of the PCovMatrix. pRow, pCol and value are required;
       tRow, tCol and targetCategory are optional. -->
  <xs:element name="PCovCell">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      </xs:sequence>
      <xs:attribute name="pRow" use="required" type="xs:string"/>
      <xs:attribute name="pCol" use="required" type="xs:string"/>
      <xs:attribute name="tRow" type="xs:string"/>
      <xs:attribute name="tCol" type="xs:string"/>
      <xs:attribute name="value" use="required" type="REAL-NUMBER"/>
      <xs:attribute name="targetCategory" type="xs:string"/>
    </xs:complexType>
  </xs:element>

  <!-- ParamMatrix: the Parameter matrix, written as a sequence of PCell
       elements (at least one). -->
  <xs:element name="ParamMatrix">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
        <xs:element ref="PCell" maxOccurs="unbounded"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>

  <!-- PCell: one cell of the ParamMatrix. parameterName and beta are required;
       targetCategory and df are optional. -->
  <xs:element name="PCell">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      </xs:sequence>
      <xs:attribute name="targetCategory" type="xs:string"/>
      <xs:attribute name="parameterName" use="required" type="xs:string"/>
      <xs:attribute name="beta" use="required" type="REAL-NUMBER"/>
      <xs:attribute name="df" type="INT-NUMBER"/>
    </xs:complexType>
  </xs:element>

GeneralRegressionModel: marks the beginning of a general regression model. As the name suggests, it is intended to support a multitude of regression models.

ParameterList: lists all Parameters. Each Parameter contains a required name, and optional label. Parameter names should be unique within the model and as brief as possible (since Parameter names appear frequently in the document). The label, if present, is meant to give a hint on a Parameter's correlation with the Predictors.

FactorList: list of factor names. Not present if this particular regression flavor does not support factors (ex. linear regression). If present, the list may or may not be empty. Each name in the list must match a DataField name or a DerivedField name. The factors are assumed to be categorical variables.

CovariateList: list of covariate names. Will not be present when there is no covariate. Each name in the list must match a DataField name or a DerivedField name. The covariates will be treated as continuous variables.

targetVariableName: name of the target variable (also called response variable). Must match a DataField name. This field is now deprecated since MiningSchema has the name of the predicted variable.

modelType: specifies the type of regression model in use. This information will be used to select the appropriate mathematical formulas during scoring. The supported regression algorithms are listed.

modelName and algorithmName can have arbitrary strings describing the specific model.

functionName can only be classification or regression.
cumulativeLink: specifies the type of cumulative link function to use when ordinalMultinomial model type is specified.

CUMULATIVE-LINK-FUNCTION data type

The definition
  <!-- CUMULATIVE-LINK-FUNCTION: string type restricted to the five
       enumerated values listed below. -->
  <xs:simpleType name="CUMULATIVE-LINK-FUNCTION">
    <xs:restriction base="xs:string">
      <xs:enumeration value="logit"/>
      <xs:enumeration value="probit"/>
      <xs:enumeration value="cloglog"/>
      <xs:enumeration value="loglog"/>
      <xs:enumeration value="cauchit"/>
    </xs:restriction>
  </xs:simpleType>
is used for specifying a cumulative link function used in ordinalMultinomial model. Specific formulas are listed below in the scoring example.

linkFunction: specifies the type of link function to use when generalizedLinear model type is specified.

LINK-FUNCTION data type

The definition
  <!-- LINK-FUNCTION: string type restricted to the ten enumerated values
       listed below. -->
  <xs:simpleType name="LINK-FUNCTION">
    <xs:restriction base="xs:string">
      <xs:enumeration value="cloglog"/>
      <xs:enumeration value="identity"/>
      <xs:enumeration value="log"/>
      <xs:enumeration value="logc"/>
      <xs:enumeration value="logit"/>
      <xs:enumeration value="loglog"/>
      <xs:enumeration value="negbin"/>
      <xs:enumeration value="oddspower"/>
      <xs:enumeration value="power"/>
      <xs:enumeration value="probit"/>
    </xs:restriction>
  </xs:simpleType>
is used for specifying a link function used in generalizedLinear model. Specific formulas are listed below in the scoring example.

linkParameter: specifies the additional numeric parameter required by the oddspower and power link functions.

trialsVariable: specifies an additional variable used during scoring some generalizedLinear models (see the description of scoring procedure below). This attribute must refer to a DataField or a DerivedField.

trialsValue: a positive integer used during scoring some generalizedLinear models (see the description of scoring procedure below). At most one of the attributes trialsVariable and trialsValue can be present in a model.

distribution: the probability distribution of the dependent variable for generalizedLinear model may be specified as normal, binomial, gamma, inverse Gaussian, negative binomial, or Poisson.

distParameter: specifies an ancillary parameter value for the negative binomial distribution.

offsetVariable: if present, this variable is used during scoring generalizedLinear or ordinalMultinomial models (see the description of scoring procedures below). This attribute must refer to a DataField or a DerivedField.

offsetValue: if present, this value is used during scoring generalizedLinear or ordinalMultinomial models. It works like a user-specified intercept (see the description of the scoring procedures below). At most one of the attributes offsetVariable and offsetValue can be present in a model.
PPMatrix: Predictor-to-Parameter correlation matrix. It is a rectangular matrix having a column for each Predictor (factor or covariate) and a row for each Parameter. The matrix is represented as a sequence of cells, each cell containing a number representing the correlation between the Predictor and the Parameter. The cell values are computed as follows:

For each Predictor variable v and each Parameter p, the corresponding cell value is missing (empty) if there is no correlation between v and p.

If there is a correlation between a covariate Predictor and the Parameter, the cell value is set to the exponent that the covariate is raised to in the dependency expression. Example: assuming variable jobcat is a factor and work is a covariate, the Parameter [jobcat=professional] * work * work is correlated to the covariate work, and the number that should be entered in the cell is 2 because work is present at second power in the expression.

If there is a correlation between a factor variable and the Parameter, the cell value is set to the Predictor value that determines the correlation. Example: Assuming the categories of the factor variable jobcat are professional, clerical, skilled, unskilled, the cell in the matrix that corresponds to (jobcat, jobcat=skilled) has a value of skilled.

The empty cells are not required to be present in the exported model file. All cells determined to be missing from the xml file at model parsing will be assumed to be empty. Since empty cells make up a large chunk of the matrix, this will reduce the size of the exported model.

Note the optional targetCategory attribute of PPCell. It is permitted in order to allow the use of different PPMatrices for different response values in classification models. For a multinomialLogistic model, if any PPCell contains this attribute, the expectation is that a full PPMatrix for that particular response level can be reconstructed from the PMML document. It is that matrix which will be used during scoring in order to get the probability (and other statistics) for the response level. By default, all target categories share the same PPMatrix.

targetCategory attribute can thus be used to override the default for some or all target categories.

PPCell: cell in the PPMatrix. Knows its row name, column name, and information as described above.

PCovMatrix: matrix of Parameter estimate covariances. Made up of PCovCells, each of them being located via row information for Parameter name (pRow), row information for target variable value (tRow), column information for Parameter name (pCol) and column information for target variable value (tCol). Note that the matrix is symmetric with respect to the main diagonal (interchanging tRow and tCol together with pRow and pCol will not change the value). Therefore it is sufficient that only half of the matrix be exported. Attributes tRow and tCol are optional since they are not needed for linear regression models. This element has an optional attribute type that can take values model and robust. This attribute describes the way the covariance matrix was computed in generalizedLinear model. The robust option is also known as Huber-White or sandwich or HCCM.

ParamMatrix: Parameter matrix. A table containing the Parameter values along with associated statistics (degrees of freedom). One dimension has the target variable's categories, the other has the Parameter names. The table is represented by specifying each cell. There is no requirement for Parameter names other than that each name should uniquely identify one Parameter.

PCell: cell in the ParamMatrix. The optional targetCategory and required parameterName attributes determine the cell's location in the Parameter matrix. The information contained is: beta (actual Parameter value, required), and df (degrees of freedom, optional). For ordinalMultinomial model ParamMatrix specifies different values for the intercept parameter: one for each target category except one. Values for all other parameters are constant across all target variable values.

General Regression Samples: Multinomial Logistic Example

Here is the information about the variables:

Name      Type      Number of Categories (numeric coding in parentheses)
                    categories

JOBCAT    Target      7       Clerical(1), Office trainee(2), Security officer(3),
                              College trainee(4), Exempt employee(5),
                              MBA trainee(6), and Technical(7)
SEX       Factor      2       Males(0), and Females(1)
MINORITY  Factor      2       White(0), and Nonwhite(1)
AGE       Covariate
WORK      Covariate

The Parameter estimates are displayed as follows:

The PPMatrix is:

Parameter                         SEX    MINORITY   AGE    WORK
Intercept
[SEX = 0]                          0
[SEX = 1]                          1
[MINORITY = 0]([SEX = 0])          0        0
[MINORITY = 1]([SEX = 0])          0        1
[MINORITY = 0]([SEX = 1])          1        0
[MINORITY = 1]([SEX = 1])          1        1
AGE                                                  1
WORK                                                         1

This mapping of Predictor-to-Parameter combinations is the same for each target variable category. The corresponding XML model is:


  <?xml version="1.0" encoding="UTF-8"?>
  <!-- Multinomial logistic example: classifies jobcat (7 categories) from the
       factors sex and minority and the covariates age and work. -->
  <PMML version="3.2" xmlns="http://www.dmg.org/PMML-3_2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <Header copyright="dmg.org"/>
    <!-- The target and the two factors are categorical; the covariates are continuous. -->
    <DataDictionary numberOfFields="5">
      <DataField name="jobcat" optype="categorical" dataType="double"/>
      <DataField name="minority" optype="categorical" dataType="double"/>
      <DataField name="sex" optype="categorical" dataType="double"/>
      <DataField name="age" optype="continuous" dataType="double"/>
      <DataField name="work" optype="continuous" dataType="double"/>
    </DataDictionary>

    <GeneralRegressionModel
          targetVariableName="jobcat"
          modelType="multinomialLogistic"
          functionName="classification">

      <MiningSchema>
        <MiningField name="jobcat" usageType="predicted"/>
        <MiningField name="minority" usageType="active"/>
        <MiningField name="sex" usageType="active"/>
        <MiningField name="age" usageType="active"/>
        <MiningField name="work" usageType="active"/>
      </MiningSchema>

      <!-- Short names p0..p8 keep the matrices compact; labels describe each Parameter. -->
      <ParameterList>
        <Parameter name="p0" label="Intercept"/>
        <Parameter name="p1" label="[SEX=0]"/>
        <Parameter name="p2" label="[SEX=1]"/>
        <Parameter name="p3" label="[MINORITY=0]([SEX=0])"/>
        <Parameter name="p4" label="[MINORITY=1]([SEX=0])"/>
        <Parameter name="p5" label="[MINORITY=0]([SEX=1])"/>
        <Parameter name="p6" label="[MINORITY=1]([SEX=1])"/>
        <Parameter name="p7" label="age"/>
        <Parameter name="p8" label="work"/>
      </ParameterList>

      <FactorList>
        <Predictor name="sex" />
        <Predictor name="minority" />
      </FactorList>

      <CovariateList>
        <Predictor name="age" />
        <Predictor name="work" />
      </CovariateList>

      <!-- Only non-empty cells are listed. For the factors sex and minority the
           value is the matching category; for the covariates age and work it is
           the exponent. The intercept p0 has no cells (empty row). -->
      <PPMatrix>
        <PPCell value="0" predictorName="sex" parameterName="p1"/>
        <PPCell value="1" predictorName="sex" parameterName="p2"/>
        <PPCell value="0" predictorName="sex" parameterName="p3"/>
        <PPCell value="0" predictorName="sex" parameterName="p4"/>
        <PPCell value="1" predictorName="sex" parameterName="p5"/>
        <PPCell value="1" predictorName="sex" parameterName="p6"/>
        <PPCell value="0" predictorName="minority" parameterName="p3"/>
        <PPCell value="1" predictorName="minority" parameterName="p4"/>
        <PPCell value="0" predictorName="minority" parameterName="p5"/>
        <PPCell value="1" predictorName="minority" parameterName="p6"/>
        <PPCell value="1" predictorName="age" parameterName="p7"/>
        <PPCell value="1" predictorName="work" parameterName="p8"/>
      </PPMatrix>

      <!-- Estimates are given for target categories 1 to 6 only: by convention
           the parameter vector of the last response category (7) is zero. -->
      <ParamMatrix>
        <PCell targetCategory="1" parameterName="p0" beta="26.836" df="1"/>
        <PCell targetCategory="1" parameterName="p1" beta="-.719" df="1"/>
        <PCell targetCategory="1" parameterName="p3" beta="-19.214" df="1"/>
        <PCell targetCategory="1" parameterName="p5" beta="-.114" df="1"/>
        <PCell targetCategory="1" parameterName="p7" beta="-.133" df="1"/>
        <PCell targetCategory="1" parameterName="p8" beta="7.885E-02" df="1"/>
        <PCell targetCategory="2" parameterName="p0" beta="31.077" df="1"/>
        <PCell targetCategory="2" parameterName="p1" beta="-.869" df="1"/>
        <PCell targetCategory="2" parameterName="p3" beta="-18.99" df="1"/>
        <PCell targetCategory="2" parameterName="p5" beta="1.01" df="1"/>
        <PCell targetCategory="2" parameterName="p7" beta="-.3" df="1"/>
        <PCell targetCategory="2" parameterName="p8" beta=".152" df="1"/>
        <PCell targetCategory="3" parameterName="p0" beta="6.836" df="1"/>
        <PCell targetCategory="3" parameterName="p1" beta="16.305" df="1"/>
        <PCell targetCategory="3" parameterName="p3" beta="-20.041" df="1"/>
        <PCell targetCategory="3" parameterName="p5" beta="-.73" df="1"/>
        <PCell targetCategory="3" parameterName="p7" beta="-.156" df="1"/>
        <PCell targetCategory="3" parameterName="p8" beta=".267" df="1"/>
        <PCell targetCategory="4" parameterName="p0" beta="8.816" df="1"/>
        <PCell targetCategory="4" parameterName="p1" beta="15.264" df="1"/>
        <PCell targetCategory="4" parameterName="p3" beta="-16.799" df="1"/>
        <PCell targetCategory="4" parameterName="p5" beta="16.48" df="1"/>
        <PCell targetCategory="4" parameterName="p7" beta="-.133" df="1"/>
        <PCell targetCategory="4" parameterName="p8" beta="-.16" df="1"/>
        <PCell targetCategory="5" parameterName="p0" beta="5.862" df="1"/>
        <PCell targetCategory="5" parameterName="p1" beta="16.437" df="1"/>
        <PCell targetCategory="5" parameterName="p3" beta="-17.309" df="1"/>
        <PCell targetCategory="5" parameterName="p5" beta="15.888" df="1"/>
        <PCell targetCategory="5" parameterName="p7" beta="-.105" df="1"/>
        <PCell targetCategory="5" parameterName="p8" beta="6.914E-02" df="1"/>
        <PCell targetCategory="6" parameterName="p0" beta="6.495" df="1"/>
        <PCell targetCategory="6" parameterName="p1" beta="17.297" df="1"/>
        <PCell targetCategory="6" parameterName="p3" beta="-19.098" df="1"/>
        <PCell targetCategory="6" parameterName="p5" beta="16.841" df="1"/>
        <PCell targetCategory="6" parameterName="p7" beta="-.141" df="1"/>
        <PCell targetCategory="6" parameterName="p8" beta="-5.058E-02" df="1"/>
      </ParamMatrix>

    </GeneralRegressionModel>

  </PMML>

Scoring Algorithm

We will use the above example to illustrate the steps that should be followed in the scoring process. Say the following case (observation) must be scored:

         obs = (sex=1 minority=0 age=25 work=4)

Do model file parsing. Reconstruct the PPMatrix and the Parameter matrix.
To score a case, construct the vector x (of length equal to the number of Parameters in the model) as follows.
- If row i of the PP correlation matrix is empty, that means the i-th parameter is an intercept, set x_i = 1.
- If row i of the PP correlation matrix is nonempty and corresponds to a factor value or set of factor values, set x_i to 1 if the case being scored matches this row, 0 if it does not.
- If row i of the PP correlation matrix is nonempty and corresponds to a covariate c, with the entry r, then r is the multiplicity of the covariate c in the parameter, so set x_i=c^r using the value of c in the record.
- If row i of the PP correlation matrix is nonempty and corresponds to a number of covariates, then set x_i to be the product of covariate values (from the record) using their corresponding multiplicities found in the PP matrix row.
- Finally, if row i of the PP correlation matrix is nonempty and corresponds to a combination of factors and covariates, then set x_i to be the product of covariate values (from the record) using their corresponding multiplicities found in the PP matrix if the factor values in the record match those in the PP matrix row and 0 otherwise.
Now for each response category (value of the target variable) $j$, let $\beta_j$ be the vector of Parameter estimates for that response category. (If $k$ is the last response category, remember that by convention $\beta_k = 0$.) Set $r_j = \langle x, \beta_j \rangle$ and $s_j = \exp(r_j)$. The probability that our case falls into category $j$ is then $p_j = s_j / (s_1 + \dots + s_k)$.
If you just want to assign each case to the category into which it has the highest probability of falling, it is not necessary to compute anything after $r_j$; the category whose $r_j$ value is highest is the one you want. If you want to compute the actual probabilities (for instance, in order to know whether you are assigning a case to a 51% good or a 99% good category), we use a little dodge to avoid overflow. Namely, $p_j$ is the reciprocal of $\exp(r_1 - r_j) + \dots + \exp(r_k - r_j)$. If $r_i - r_j > 700$ for any $i$, then the exponential will overflow; but in this case $p_j$ is so small that we can set it to zero. Underflow in the denominator can be ignored since the term $\exp(r_j - r_j) = 1$ ensures the denominator is at least 1.

General Regression Samples: General Linear Example

The information about the variables is the same as in the previous example, but now the target variable JOBCAT is considered to be continuous.

The Predictor-to-Parameter combinations mapping is the same as above. The corresponding XML model is:


  <?xml version="1.0" encoding="UTF-8"?>
  <!-- General linear example: predicts jobcat, treated as a continuous target,
       from the factors sex and minority and the covariates age and work. -->
  <PMML version="3.2" xmlns="http://www.dmg.org/PMML-3_2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <Header copyright="dmg.org"/>
    <!-- The two factors are categorical; the target and the covariates are continuous. -->
    <DataDictionary numberOfFields="5">
      <DataField name="jobcat" optype="continuous" dataType="double"/>
      <DataField name="minority" optype="categorical" dataType="double"/>
      <DataField name="sex" optype="categorical" dataType="double"/>
      <DataField name="age" optype="continuous" dataType="double"/>
      <DataField name="work" optype="continuous" dataType="double"/>
    </DataDictionary>

    <GeneralRegressionModel
          targetVariableName="jobcat"
          modelType="generalLinear"
          functionName="regression">

      <MiningSchema>
        <MiningField name="jobcat" usageType="predicted"/>
        <MiningField name="minority" usageType="active"/>
        <MiningField name="sex" usageType="active"/>
        <MiningField name="age" usageType="active"/>
        <MiningField name="work" usageType="active"/>
      </MiningSchema>

      <!-- Short names p0..p8 keep the matrices compact; labels describe each Parameter. -->
      <ParameterList>
        <Parameter name="p0" label="Intercept"/>
        <Parameter name="p1" label="[SEX=0]"/>
        <Parameter name="p2" label="[SEX=1]"/>
        <Parameter name="p3" label="[MINORITY=0]([SEX=0])"/>
        <Parameter name="p4" label="[MINORITY=1]([SEX=0])"/>
        <Parameter name="p5" label="[MINORITY=0]([SEX=1])"/>
        <Parameter name="p6" label="[MINORITY=1]([SEX=1])"/>
        <Parameter name="p7" label="age"/>
        <Parameter name="p8" label="work"/>
      </ParameterList>

      <FactorList>
        <Predictor name="sex" />
        <Predictor name="minority" />
      </FactorList>

      <CovariateList>
        <Predictor name="age" />
        <Predictor name="work" />
      </CovariateList>

      <!-- Only non-empty cells are listed. For the factors sex and minority the
           value is the matching category; for the covariates age and work it is
           the exponent. The intercept p0 has no cells (empty row). -->
      <PPMatrix>
        <PPCell value="0" predictorName="sex" parameterName="p1"/>
        <PPCell value="1" predictorName="sex" parameterName="p2"/>
        <PPCell value="0" predictorName="sex" parameterName="p3"/>
        <PPCell value="0" predictorName="sex" parameterName="p4"/>
        <PPCell value="1" predictorName="sex" parameterName="p5"/>
        <PPCell value="1" predictorName="sex" parameterName="p6"/>
        <PPCell value="0" predictorName="minority" parameterName="p3"/>
        <PPCell value="1" predictorName="minority" parameterName="p4"/>
        <PPCell value="0" predictorName="minority" parameterName="p5"/>
        <PPCell value="1" predictorName="minority" parameterName="p6"/>
        <PPCell value="1" predictorName="age" parameterName="p7"/>
        <PPCell value="1" predictorName="work" parameterName="p8"/>
      </PPMatrix>

      <!-- The target is continuous, so the cells carry no targetCategory. -->
      <ParamMatrix>
        <PCell  parameterName="p0" beta="1.602" df="1"/>
        <PCell  parameterName="p1" beta="0.580" df="1"/>
        <PCell  parameterName="p3" beta="0.831" df="1"/>
        <PCell  parameterName="p5" beta="0.429" df="1"/>
        <PCell  parameterName="p7" beta="-0.012" df="1"/>
        <PCell  parameterName="p8" beta="0.010"  df="1"/>
      </ParamMatrix>

    </GeneralRegressionModel>

  </PMML>

Scoring Algorithm

For this example the steps that should be followed in the scoring process are similar to the previous one but fewer. Say the following case (observation) must be scored:

         obs = (sex=1 minority=0 age=25 work=4)

Do model file parsing. Reconstruct the PPMatrix and the Parameter matrix.
To score a case, construct the vector x (of length equal to the number of Parameters in the model) as follows.
- If row i of the PP correlation matrix is empty, set x_i = 1.
- If row i of the PP correlation matrix is nonempty and corresponds to a factor value or set of factor values, set x_i to 1 if the case being scored matches this row, 0 if it does not.
- If row i of the PP correlation matrix is nonempty and corresponds to a covariate c, with the entry r, then r is the multiplicity of the covariate c in the parameter, so set x_i=c^r using the value of c in the record.
- If row i of the PP correlation matrix is nonempty and corresponds to a number of covariates, then set x_i to be the product of covariate values (from the record) using their corresponding multiplicities found in the PP matrix row.
- Finally, if row i of the PP correlation matrix is nonempty and corresponds to a combination of factors and covariates, then set x_i to be the product of covariate values (from the record) using their corresponding multiplicities found in the PP matrix if the factor values in the record match those in the PP matrix row and 0 otherwise.
Now let β be the vector of Parameter estimates. The inner product r = <x,β> is the predicted value for the considered case.

General Regression Samples: Ordinal Multinomial Example

The information about the variables is the same as in the previous examples, but now the target variable JOBCAT is considered to be ordinal.

The Predictor-to-Parameter combinations mapping is the same as above. The corresponding XML model is:


  <?xml version="1.0" encoding="UTF-8"?>
  <!-- Ordinal multinomial example: classifies jobcat, treated as an ordinal
       target, from the factors sex and minority and the covariates age and
       work, using the logit cumulative link. -->
  <PMML version="3.2" xmlns="http://www.dmg.org/PMML-3_2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <Header copyright="dmg.org"/>
    <!-- The target is ordinal, the two factors are categorical, the covariates are continuous. -->
    <DataDictionary numberOfFields="5">
      <DataField name="jobcat" optype="ordinal" dataType="double"/>
      <DataField name="minority" optype="categorical" dataType="double"/>
      <DataField name="sex" optype="categorical" dataType="double"/>
      <DataField name="age" optype="continuous" dataType="double"/>
      <DataField name="work" optype="continuous" dataType="double"/>
    </DataDictionary>

    <GeneralRegressionModel
          targetVariableName="jobcat"
          modelType="ordinalMultinomial"
          functionName="classification"
          cumulativeLink="logit">

      <MiningSchema>
        <MiningField name="jobcat" usageType="predicted"/>
        <MiningField name="minority" usageType="active"/>
        <MiningField name="sex" usageType="active"/>
        <MiningField name="age" usageType="active"/>
        <MiningField name="work" usageType="active"/>
      </MiningSchema>

      <!-- Short names p0..p8 keep the matrices compact; labels describe each Parameter. -->
      <ParameterList>
        <Parameter name="p0" label="Intercept"/>
        <Parameter name="p1" label="[SEX=0]"/>
        <Parameter name="p2" label="[SEX=1]"/>
        <Parameter name="p3" label="[MINORITY=0]([SEX=0])"/>
        <Parameter name="p4" label="[MINORITY=1]([SEX=0])"/>
        <Parameter name="p5" label="[MINORITY=0]([SEX=1])"/>
        <Parameter name="p6" label="[MINORITY=1]([SEX=1])"/>
        <Parameter name="p7" label="age"/>
        <Parameter name="p8" label="work"/>
      </ParameterList>

      <FactorList>
        <Predictor name="sex" />
        <Predictor name="minority" />
      </FactorList>

      <CovariateList>
        <Predictor name="age" />
        <Predictor name="work" />
      </CovariateList>

      <!-- Only non-empty cells are listed. For the factors sex and minority the
           value is the matching category; for the covariates age and work it is
           the exponent. The intercept p0 has no cells (empty row). -->
      <PPMatrix>
        <PPCell value="0" predictorName="sex" parameterName="p1"/>
        <PPCell value="1" predictorName="sex" parameterName="p2"/>
        <PPCell value="0" predictorName="sex" parameterName="p3"/>
        <PPCell value="0" predictorName="sex" parameterName="p4"/>
        <PPCell value="1" predictorName="sex" parameterName="p5"/>
        <PPCell value="1" predictorName="sex" parameterName="p6"/>
        <PPCell value="0" predictorName="minority" parameterName="p3"/>
        <PPCell value="1" predictorName="minority" parameterName="p4"/>
        <PPCell value="0" predictorName="minority" parameterName="p5"/>
        <PPCell value="1" predictorName="minority" parameterName="p6"/>
        <PPCell value="1" predictorName="age" parameterName="p7"/>
        <PPCell value="1" predictorName="work" parameterName="p8"/>
      </PPMatrix>

      <!-- The intercept p0 has one value per target category except one
           (categories 1 to 6). The other parameters carry no targetCategory:
           their values are constant across all target values. -->
      <ParamMatrix>
        <PCell targetCategory="1" parameterName="p0" beta="-0.683" df="1"/>
        <PCell targetCategory="2" parameterName="p0" beta="0.723" df="1"/>
        <PCell targetCategory="3" parameterName="p0" beta="1.104" df="1"/>
        <PCell targetCategory="4" parameterName="p0" beta="1.922" df="1"/>
        <PCell targetCategory="5" parameterName="p0" beta="3.386" df="1"/>
        <PCell targetCategory="6" parameterName="p0" beta="4.006" df="1"/>
        <PCell  parameterName="p1" beta="1.096" df="1"/>
        <PCell  parameterName="p3" beta="0.957" df="1"/>
        <PCell  parameterName="p5" beta="1.149" df="1"/>
        <PCell  parameterName="p7" beta="-0.067" df="1"/>
        <PCell  parameterName="p8" beta="0.060"  df="1"/>
      </ParamMatrix>

    </GeneralRegressionModel>

  </PMML>

Scoring Algorithm

For this example the steps that should be followed in the scoring process are somewhat similar to the first example but also the link function is used. Say the following case (observation) must be scored:

         obs = (sex=1 minority=0 age=25 work=4)

Do model file parsing. Reconstruct the PPMatrix and the Parameter matrix.
To score a case, construct the vector x (of length equal to the number of Parameters in the model) as follows.
- If row i of the PP correlation matrix is empty, set x_i = 1.
- If row i of the PP correlation matrix is nonempty and corresponds to a factor value or set of factor values, set x_i to 1 if the case being scored matches this row, 0 if it does not.
- If row i of the PP correlation matrix is nonempty and corresponds to a covariate c, with the entry r, then r is the multiplicity of the covariate c in the parameter, so set x_i=c^r using the value of c in the record.
- If row i of the PP correlation matrix is nonempty and corresponds to a number of covariates, then set x_i to be the product of covariate values (from the record) using their corresponding multiplicities found in the PP matrix row.
- Finally, if row i of the PP correlation matrix is nonempty and corresponds to a combination of factors and covariates, then set x_i to be the product of covariate values (from the record) using their corresponding multiplicities found in the PP matrix if the factor values in the record match those in the PP matrix row and 0 otherwise.
Obtain the values of the offsetVariable or offsetValue a.
Set
- a = value from the observation information if offsetVariable is used
- a = offsetValue from the XML file if offsetValue is used
- a = 0 otherwise.
When the target variable has only two categories, the inverse of the link function transforms the value predicted by the regression equation into the corresponding probability of the first target category. If the target variable is ordinal with more than two categories, a different intercept parameter value is specified by the model for each target category except the last. The inverse of the link function then transforms the value predicted by the regression equation with the specified intercept value into the corresponding cumulative probability for the given category.

How to compute p_j := probability of target=Value_j

For each response category (value of the target variable) j, let β_j be the vector of Parameter estimates for that response category. (If k is the last response category, β_k is not specified.) For the given case let <x,β_j> be the result of evaluating the inner product just like in the multinomialLogistic model and y_j = <x,β_j> + a. Predicted probability for each category is then computed according to the following formulas:

p₁ = F(y₁)
p_j = F(y_j) - F(y_{j-1}) , for 2 ≤ j < k
p_k = 1 - F(y_{k-1})

Function F is an inverse of the specified link function:

logit, ordinal
inverse of logit function: F(y)= 1/(1+exp(-y)).

probit, ordinal
inverse of probit function: F(y)= integral(from -∞ to y)(1/sqrt(2*π))exp(-0.5*u*u)du.

cloglog, ordinal
inverse of cloglog function: F(y)= 1 - exp( -exp(y) ).

loglog, ordinal
inverse of loglog function: F(y)= exp( -exp(-y) ).

cauchit, ordinal
inverse of cauchit function: F(y)= 0.5 + (1/π) arctan(y).

General Regression Samples: Simple Regression Example

Only two continuous predictors are used in this example, and the target variable JOBCAT is considered to be continuous.

The Predictor-to-Parameter combinations mapping is trivial. The corresponding XML model is:


  <?xml version="1.0" encoding="UTF-8"?>
  <!--
    Simple linear regression sample: jobcat is predicted from the two
    covariates age and work.
    The default namespace must be the http form of the PMML 3.2 namespace
    name: a namespace name is an identifier compared as a literal string, so
    an https variant would place every element in a different namespace.
  -->
  <PMML version="3.2" xmlns="http://www.dmg.org/PMML-3_2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <Header copyright="dmg.org"/>
    <!-- The dictionary declares all five fields; this model's MiningSchema
         uses only jobcat, age and work. -->
    <DataDictionary numberOfFields="5">
      <DataField name="jobcat" optype="continuous" dataType="double"/>
      <DataField name="minority" optype="continuous" dataType="double"/>
      <DataField name="sex" optype="continuous" dataType="double"/>
      <DataField name="age" optype="continuous" dataType="double"/>
      <DataField name="work" optype="continuous" dataType="double"/>
    </DataDictionary>

    <!-- targetVariableName is informational only; the predicted field is
         identified by usageType="predicted" in the MiningSchema. -->
    <GeneralRegressionModel
          targetVariableName="jobcat"
          modelType="regression"
          functionName="regression">

      <MiningSchema>
        <MiningField name="jobcat" usageType="predicted"/>
        <MiningField name="age" usageType="active"/>
        <MiningField name="work" usageType="active"/>
      </MiningSchema>

      <ParameterList>
        <Parameter name="p0" label="Intercept"/>
        <Parameter name="p1" label="age"/>
        <Parameter name="p2" label="work"/>
      </ParameterList>

      <CovariateList>
        <Predictor name="age" />
        <Predictor name="work" />
      </CovariateList>

      <!-- Each covariate enters its parameter with exponent 1 (linear
           term). p0 has no PPCell, so its x entry is 1. -->
      <PPMatrix>
        <PPCell value="1" predictorName="age" parameterName="p1"/>
        <PPCell value="1" predictorName="work" parameterName="p2"/>
      </PPMatrix>

      <!-- Parameter estimates; the predicted value is the inner product of
           these betas with the x vector built from the case. -->
      <ParamMatrix>
        <PCell  parameterName="p0" beta="2.922" df="1"/>
        <PCell  parameterName="p1" beta="-0.031" df="1"/>
        <PCell  parameterName="p2" beta="0.034" df="1"/>
      </ParamMatrix>

    </GeneralRegressionModel>

  </PMML>

Scoring Algorithm

For this example the steps that should be followed in the scoring process are somewhat similar to the general linear example but are even simpler. Say the following case (observation) must be scored:

         obs = (age=25 work=4)

Do model file parsing. Reconstruct the PPMatrix and the Parameter matrix.
To score a case, construct the vector x (of length equal to the number of Parameters in the model) as follows.
- If row i of the PP correlation matrix is empty, set x_i = 1.
- If row i of the PP correlation matrix is nonempty and corresponds to a covariate c, the row should contain exactly one nonzero entry, in the column corresponding to the independent variable c. The value of this entry should be 1 since it is a linear model, so set x_i = c (using the value of c which appears in this case).
Now let β be the vector of Parameter estimates. The inner product r = <x,β> is the predicted value for the considered case.

General Regression Samples: Generalized Linear Model Example

The information about the variables is the same as in the previous examples, but now the target variable JOBCAT is considered to be continuous.

The Predictor-to-Parameter combinations mapping is the same as above. The corresponding XML model is:


  <?xml version="1.0" encoding="UTF-8"?>
  <!--
    Generalized linear model sample (gamma distribution, power link with
    parameter -1, constant offset 3). sex and minority are listed as
    factors, age and work as covariates.
    The default namespace must be the http form of the PMML 3.2 namespace
    name: a namespace name is an identifier compared as a literal string, so
    an https variant would place every element in a different namespace.
  -->
  <PMML version="3.2" xmlns="http://www.dmg.org/PMML-3_2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <Header copyright="dmg.org"/>
    <DataDictionary numberOfFields="5">
      <DataField name="jobcat" optype="continuous" dataType="double"/>
      <DataField name="minority" optype="continuous" dataType="double"/>
      <DataField name="sex" optype="continuous" dataType="double"/>
      <DataField name="age" optype="continuous" dataType="double"/>
      <DataField name="work" optype="continuous" dataType="double"/>
    </DataDictionary>

    <!-- modelType is generalizedLinear: distribution, linkFunction,
         linkParameter and offsetValue are the inputs of the generalized
         linear scoring formula F(x.beta + a) * b. With the power link and
         linkParameter d = -1 the inverse link is F(y) = y^(1/d);
         offsetValue supplies a = 3. -->
    <GeneralRegressionModel
          targetVariableName="jobcat"
          modelType="generalizedLinear"
          modelName="GZLM"
          functionName="regression"
          distribution="gamma"
          linkFunction="power"
          linkParameter="-1"
          offsetValue="3">

      <MiningSchema>
        <MiningField name="jobcat" usageType="predicted"/>
        <MiningField name="minority" usageType="active"/>
        <MiningField name="sex" usageType="active"/>
        <MiningField name="age" usageType="active"/>
        <MiningField name="work" usageType="active"/>
      </MiningSchema>

      <!-- One Parameter per term of the linear predictor; label is a
           human-readable description of the term. -->
      <ParameterList>
        <Parameter name="p0" label="Intercept"/>
        <Parameter name="p1" label="[SEX=0]"/>
        <Parameter name="p2" label="[SEX=1]"/>
        <Parameter name="p3" label="[MINORITY=0]([SEX=0])"/>
        <Parameter name="p4" label="[MINORITY=1]([SEX=0])"/>
        <Parameter name="p5" label="[MINORITY=0]([SEX=1])"/>
        <Parameter name="p6" label="[MINORITY=1]([SEX=1])"/>
        <Parameter name="p7" label="age"/>
        <Parameter name="p8" label="work"/>
      </ParameterList>

      <FactorList>
        <Predictor name="sex" />
        <Predictor name="minority" />
      </FactorList>

      <CovariateList>
        <Predictor name="age" />
        <Predictor name="work" />
      </CovariateList>

      <!-- Predictor-to-Parameter mapping. For a factor, value is the level
           the case must match; for a covariate, value is its exponent
           (multiplicity) in the term. p0 has no PPCell, so its x entry is 1. -->
      <PPMatrix>
        <PPCell value="0" predictorName="sex" parameterName="p1"/>
        <PPCell value="1" predictorName="sex" parameterName="p2"/>
        <PPCell value="0" predictorName="sex" parameterName="p3"/>
        <PPCell value="0" predictorName="sex" parameterName="p4"/>
        <PPCell value="1" predictorName="sex" parameterName="p5"/>
        <PPCell value="1" predictorName="sex" parameterName="p6"/>
        <PPCell value="0" predictorName="minority" parameterName="p3"/>
        <PPCell value="1" predictorName="minority" parameterName="p4"/>
        <PPCell value="0" predictorName="minority" parameterName="p5"/>
        <PPCell value="1" predictorName="minority" parameterName="p6"/>
        <PPCell value="1" predictorName="age" parameterName="p7"/>
        <PPCell value="1" predictorName="work" parameterName="p8"/>
      </PPMatrix>

      <!-- Parameter estimates. p2, p4 and p6 have no PCell (presumably
           redundant levels scored as zero; confirm). -->
      <ParamMatrix>
        <PCell  parameterName="p0" beta="-2.30824444845005" df="1"/>
        <PCell  parameterName="p1" beta="-0.268177596945098" df="1"/>
        <PCell  parameterName="p3" beta="-0.169104566719988" df="1"/>
        <PCell  parameterName="p5" beta="-0.219215962160056" df="1"/>
        <PCell  parameterName="p7" beta="0.00427629446211706" df="1"/>
        <PCell  parameterName="p8" beta="-0.00397117497757107"  df="1"/>
      </ParamMatrix>

    </GeneralRegressionModel>

  </PMML>

Scoring Algorithm

For this example the steps that should be followed in the scoring process are somewhat similar to the second example but also the link function is used. Say the following case (observation) must be scored:

         obs = (sex=1 minority=0 age=25 work=4)

Do model file parsing. Reconstruct the PPMatrix and the Parameter matrix.
To score a case, construct the vector x (of length equal to the number of Parameters in the model) as follows.
- If row i of the PP correlation matrix is empty, set x_i = 1.
- If row i of the PP correlation matrix is nonempty and corresponds to a factor value or set of factor values, set x_i to 1 if the case being scored matches this row, 0 if it does not.
- If row i of the PP correlation matrix is nonempty and corresponds to a covariate c, with the entry r, then r is the multiplicity of the covariate c in the parameter, so set x_i=c^r using the value of c in the record.
- If row i of the PP correlation matrix is nonempty and corresponds to a number of covariates, then set x_i to be the product of covariate values (from the record) using their corresponding multiplicities found in the PP matrix row.
- Finally, if row i of the PP correlation matrix is nonempty and corresponds to a combination of factors and covariates, then set x_i to be the product of covariate values (from the record) using their corresponding multiplicities found in the PP matrix if the factor values in the record match those in the PP matrix row and 0 otherwise.
Obtain the values of the offsetVariable or offsetValue (a), trialsVariable or trialsValue (b), distParameter (c) and linkParameter (d).
Set
- a = value from the observation information if offsetVariable is used
- a = offsetValue from the XML file if offsetValue is used
- a = 0 otherwise
Set
- b = value from the observation information if trialsVariable is used
- b = trialsValue from the XML file if trialsValue is used
- b = 1 otherwise
Set
- c = distParameter from the XML file if link = negbin and distribution = negbin
Set
- d = linkParameter from the XML file if link = oddspower or power.
Let β be the vector of Parameter estimates and <x,β> be the inner product of two vectors x and β. The predicted value for the considered case is F(<x,β> + a)*b,
where function F is an inverse of the specified link function:

cloglog
inverse of cloglog function: F(y) = 1 - exp( -exp(y) ).

identity
inverse of identity function: F(y) = y.

log
inverse of log function: F(y) = exp(y).

logc
inverse of logc function: F(y) = 1 - exp(y).

logit
inverse of logit function: F(y)= 1/(1 + exp(-y)).

loglog
inverse of loglog function: F(y) = exp( -exp(-y) ).

negbin(c)
inverse of negbin(c) function: F(y) = 1/(c(exp(-y) - 1)).

oddspower(d)
inverse of oddspower(d) function:
F(y) = 1/(1 + (1 + d*y)^(-1/d)) if d!=0;

F(y) = 1/(1 + exp(-y)) if d=0.

power(d)
inverse of power(d) function:
F(y) = y^(1/d) if d!=0;

F(y) = exp(y) if d=0.

probit
inverse of probit function: F(y) = integral(from -∞ to y)(1/sqrt(2*π))exp(-0.5*u*u)du.

PMML 3.2 - General Regression

CUMULATIVE-LINK-FUNCTION data type

LINK-FUNCTION data type

General Regression Samples: Multinomial Logistic Example

General Regression Samples: General Linear Example

General Regression Samples: Ordinal Multinomial Example

How to compute p_j := probability of target=Value_j

General Regression Samples: Simple Regression Example

General Regression Samples: Generalized Linear Model Example

How to compute p_j := probability of target=Value_j