PMML 4.2 - General RegressionModel XSD and Tag Description
<!-- GeneralRegressionModel: top-level element of a general regression model. -->
<xs:element name="GeneralRegressionModel">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <!-- Generic model content; only MiningSchema is mandatory here. -->
      <xs:element ref="MiningSchema"/>
      <xs:element ref="Output" minOccurs="0"/>
      <xs:element ref="ModelStats" minOccurs="0"/>
      <xs:element ref="ModelExplanation" minOccurs="0"/>
      <xs:element ref="Targets" minOccurs="0"/>
      <xs:element ref="LocalTransformations" minOccurs="0"/>
      <!-- Regression-specific content: ParameterList, PPMatrix and
           ParamMatrix are mandatory, the remaining elements are optional. -->
      <xs:element ref="ParameterList"/>
      <xs:element ref="FactorList" minOccurs="0"/>
      <xs:element ref="CovariateList" minOccurs="0"/>
      <xs:element ref="PPMatrix"/>
      <xs:element ref="PCovMatrix" minOccurs="0"/>
      <xs:element ref="ParamMatrix"/>
      <xs:element ref="EventValues" minOccurs="0"/>
      <xs:element ref="BaseCumHazardTables" minOccurs="0"/>
      <xs:element ref="ModelVerification" minOccurs="0"/>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>

    <!-- Deprecated since PMML 3.0; if present it should match the name of
         the target MiningField. -->
    <xs:attribute name="targetVariableName" type="FIELD-NAME"/>

    <!-- Selects the mathematical formulas used during scoring. -->
    <xs:attribute name="modelType" use="required">
      <xs:simpleType>
        <xs:restriction base="xs:string">
          <xs:enumeration value="regression"/>
          <xs:enumeration value="generalLinear"/>
          <xs:enumeration value="multinomialLogistic"/>
          <xs:enumeration value="ordinalMultinomial"/>
          <xs:enumeration value="generalizedLinear"/>
          <xs:enumeration value="CoxRegression"/>
        </xs:restriction>
      </xs:simpleType>
    </xs:attribute>

    <xs:attribute name="modelName" type="xs:string"/>
    <xs:attribute name="functionName" type="MINING-FUNCTION" use="required"/>
    <xs:attribute name="algorithmName" type="xs:string"/>
    <xs:attribute name="targetReferenceCategory" type="xs:string"/>

    <!-- Link functions: cumulativeLink applies to ordinalMultinomial models,
         linkFunction to generalizedLinear models. -->
    <xs:attribute name="cumulativeLink" type="CUMULATIVE-LINK-FUNCTION"/>
    <xs:attribute name="linkFunction" type="LINK-FUNCTION"/>
    <xs:attribute name="linkParameter" type="REAL-NUMBER"/>

    <!-- At most one of trialsVariable and trialsValue may be present. -->
    <xs:attribute name="trialsVariable" type="FIELD-NAME"/>
    <xs:attribute name="trialsValue" type="INT-NUMBER"/>

    <!-- Probability distribution of the dependent variable
         (generalizedLinear models). -->
    <xs:attribute name="distribution">
      <xs:simpleType>
        <xs:restriction base="xs:string">
          <xs:enumeration value="binomial"/>
          <xs:enumeration value="gamma"/>
          <xs:enumeration value="igauss"/>
          <xs:enumeration value="negbin"/>
          <xs:enumeration value="normal"/>
          <xs:enumeration value="poisson"/>
          <xs:enumeration value="tweedie"/>
        </xs:restriction>
      </xs:simpleType>
    </xs:attribute>
    <xs:attribute name="distParameter" type="REAL-NUMBER"/>

    <!-- At most one of offsetVariable and offsetValue may be present. -->
    <xs:attribute name="offsetVariable" type="FIELD-NAME"/>
    <xs:attribute name="offsetValue" type="REAL-NUMBER"/>

    <!-- Degrees of freedom, needed for confidence intervals of predictions. -->
    <xs:attribute name="modelDF" type="REAL-NUMBER"/>

    <!-- Field references that are meaningful for CoxRegression models. -->
    <xs:attribute name="endTimeVariable" type="FIELD-NAME"/>
    <xs:attribute name="startTimeVariable" type="FIELD-NAME"/>
    <xs:attribute name="subjectIDVariable" type="FIELD-NAME"/>
    <xs:attribute name="statusVariable" type="FIELD-NAME"/>
    <xs:attribute name="baselineStrataVariable" type="FIELD-NAME"/>

    <!-- false marks the model as informational only (not to be scored). -->
    <xs:attribute name="isScorable" type="xs:boolean" default="true"/>
  </xs:complexType>
</xs:element>
<!-- ParameterList: lists all Parameters of the model. The schema allows an
     empty list; the specification text permits that only for CoxRegression. -->
<xs:element name="ParameterList">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <xs:element ref="Parameter" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
  </xs:complexType>
</xs:element>
<!-- Parameter: a named model parameter with an optional descriptive label. -->
<xs:element name="Parameter">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
    <!-- Should be unique within the model and as brief as possible. -->
    <xs:attribute name="name" type="xs:string" use="required"/>
    <!-- Hint on how the Parameter relates to the Predictors. -->
    <xs:attribute name="label" type="xs:string"/>
    <!-- Used in Cox regression models only. -->
    <xs:attribute name="referencePoint" type="REAL-NUMBER" default="0"/>
  </xs:complexType>
</xs:element>
<!-- FactorList: the categorical predictors (factors); may be empty. -->
<xs:element name="FactorList">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <xs:element ref="Predictor" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
  </xs:complexType>
</xs:element>
<!-- CovariateList: the predictors treated as continuous variables. -->
<xs:element name="CovariateList">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <xs:element ref="Predictor" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
  </xs:complexType>
</xs:element>
<!-- Predictor: a factor or covariate of the model. A factor may carry an
     optional list of categories and an optional contrast matrix. -->
<xs:element name="Predictor">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <xs:element ref="Categories" minOccurs="0" maxOccurs="1"/>
      <!-- Contrast matrix describing the coding of a categorical variable. -->
      <xs:element ref="Matrix" minOccurs="0"/>
    </xs:sequence>
    <!-- Must match a DataField or DerivedField name. -->
    <xs:attribute name="name" type="FIELD-NAME" use="required"/>
    <xs:attribute name="contrastMatrixType" type="xs:string"/>
  </xs:complexType>
</xs:element>
<!-- Categories: ordered list of factor categories; at least one Category. -->
<xs:element name="Categories">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <xs:element ref="Category" minOccurs="1" maxOccurs="unbounded"/>
    </xs:sequence>
  </xs:complexType>
</xs:element>
<!-- Category: a single category value of a factor. -->
<xs:element name="Category">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
    <xs:attribute name="value" type="xs:string" use="required"/>
  </xs:complexType>
</xs:element>
<!-- PPMatrix: Predictor-to-Parameter correlation matrix, stored sparsely as
     a sequence of cells. Absent cells are treated as empty, so the element
     may contain no PPCell at all. -->
<xs:element name="PPMatrix">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <xs:element ref="PPCell" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
  </xs:complexType>
</xs:element>
<!-- PPCell: one non-empty cell of the PPMatrix. -->
<xs:element name="PPCell">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
    <!-- Exponent for a covariate, or the category value for a factor. -->
    <xs:attribute name="value" type="xs:string" use="required"/>
    <!-- Matrix column (Predictor) and row (Parameter) of this cell. -->
    <xs:attribute name="predictorName" type="FIELD-NAME" use="required"/>
    <xs:attribute name="parameterName" type="xs:string" use="required"/>
    <!-- Optional override: use this cell only for the given target category. -->
    <xs:attribute name="targetCategory" type="xs:string"/>
  </xs:complexType>
</xs:element>
<!-- PCovMatrix: matrix of Parameter estimate covariances. When the element
     is present it must hold at least one PCovCell. -->
<xs:element name="PCovMatrix">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <xs:element ref="PCovCell" maxOccurs="unbounded"/>
    </xs:sequence>
    <!-- How the covariance matrix was computed (generalizedLinear models). -->
    <xs:attribute name="type">
      <xs:simpleType>
        <xs:restriction base="xs:string">
          <xs:enumeration value="model"/>
          <xs:enumeration value="robust"/>
        </xs:restriction>
      </xs:simpleType>
    </xs:attribute>
  </xs:complexType>
</xs:element>
<!-- PCovCell: one cell of the PCovMatrix, located by Parameter names
     (pRow, pCol) and, optionally, target variable values (tRow, tCol). -->
<xs:element name="PCovCell">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
    <xs:attribute name="pRow" type="xs:string" use="required"/>
    <xs:attribute name="pCol" type="xs:string" use="required"/>
    <!-- Optional: not needed for linear regression models. -->
    <xs:attribute name="tRow" type="xs:string"/>
    <xs:attribute name="tCol" type="xs:string"/>
    <xs:attribute name="value" type="REAL-NUMBER" use="required"/>
    <xs:attribute name="targetCategory" type="xs:string"/>
  </xs:complexType>
</xs:element>
<!-- ParamMatrix: table of Parameter values (and associated statistics),
     represented by listing its cells. -->
<xs:element name="ParamMatrix">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <xs:element ref="PCell" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
  </xs:complexType>
</xs:element>
<!-- PCell: one cell of the ParamMatrix, located by the optional
     targetCategory and the required parameterName. -->
<xs:element name="PCell">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
    <xs:attribute name="targetCategory" type="xs:string"/>
    <xs:attribute name="parameterName" type="xs:string" use="required"/>
    <!-- The actual Parameter value. -->
    <xs:attribute name="beta" type="REAL-NUMBER" use="required"/>
    <!-- Degrees of freedom. -->
    <xs:attribute name="df" type="INT-NUMBER"/>
  </xs:complexType>
</xs:element>
<!-- BaseCumHazardTables: baseline cumulative hazard values for Cox
     regression. Holds either one table per baseline stratum or a single
     table of cells. -->
<xs:element name="BaseCumHazardTables">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <xs:choice>
        <!-- With a baseline strata variable: one BaselineStratum per value. -->
        <xs:element ref="BaselineStratum" maxOccurs="unbounded"/>
        <!-- Without strata: the cells of the single table. -->
        <xs:element ref="BaselineCell" maxOccurs="unbounded"/>
      </xs:choice>
    </xs:sequence>
    <!-- Maximum time for which data was available. -->
    <xs:attribute name="maxTime" type="REAL-NUMBER" use="optional"/>
  </xs:complexType>
</xs:element>
<!-- BaselineStratum: the BaselineCells and maximum time for one value of
     the baseline strata variable. -->
<xs:element name="BaselineStratum">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <xs:element ref="BaselineCell" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
    <xs:attribute name="value" type="xs:string" use="required"/>
    <!-- Optional human-readable label. -->
    <xs:attribute name="label" type="xs:string"/>
    <xs:attribute name="maxTime" type="REAL-NUMBER" use="required"/>
  </xs:complexType>
</xs:element>
<!-- BaselineCell: a (time, cumulative hazard) pair of a baseline table. -->
<xs:element name="BaselineCell">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
    <xs:attribute name="time" type="REAL-NUMBER" use="required"/>
    <xs:attribute name="cumHazard" type="REAL-NUMBER" use="required"/>
  </xs:complexType>
</xs:element>
<!-- EventValues: the Value and/or Interval elements describing which
     values of the status variable correspond to the "Event" in a Cox
     regression model. -->
<xs:element name="EventValues">
  <xs:complexType>
    <xs:sequence>
      <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded"/>
      <xs:element ref="Value" minOccurs="0" maxOccurs="unbounded"/>
      <xs:element ref="Interval" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
  </xs:complexType>
</xs:element>
GeneralRegressionModel: marks the beginning of a general
regression model. As the name suggests, this element is intended to support a
multitude of regression models.
ParameterList: lists all Parameters.
Each Parameter contains a required name and optional label.
Parameter names should be unique within the model and as brief as possible
(since Parameter names appear frequently in the document). The label, if
present, is meant to give a hint on a Parameter's correlation with the
Predictors. The optional attribute referencePoint is used in Cox regression
models only and has a default value of 0. ParameterList can be
empty only for CoxRegression models, for other models at least one
Parameter should be present.
FactorList: list of factor (categorical
predictor) names. Not present if this particular regression flavor does not
support factors (e.g. linear regression). If present, the list may or may
not be empty. Each name in the list must match a DataField name or a
DerivedField name. The factors must be categorical variables.
Predictor: describes a categorical (factor)
or a continuous (covariate) predictor for the model. When describing a
factor, it can optionally contain a list of categories and a contrast
matrix. Such matrix describes the codings of categorical variables. If a
categorical variable has n values, there will be n rows
and n-1 or n columns in the matrix. The rows and columns
correspond to the categories of the factor in the order listed in the
Category element if it is present, otherwise in the order listed in
the DataField or DerivedField element. If the
Categories element is present and the corresponding
DataField or DerivedField element has a list of valid
categories, then the list in Categories should be a subset of that
in DataField or DerivedField. A contrast matrix with
n-1 columns helps to reduce the total number of parameters in the
model. The use of such a matrix during scoring is described below.
CovariateList: list of covariate names.
Will not be present when there is no covariate. Each name in the list must
match a DataField name or a DerivedField name. The covariates will be
treated as continuous variables.
targetVariableName: name of the target
variable (also called response variable). This attribute
has been deprecated since PMML 3.0. If present, it should match
the name of the target MiningField.
modelType: specifies the type of regression
model in use. This information will be used to select the appropriate
mathematical formulas during scoring. The supported regression algorithms
are listed.
modelName and algorithmName can have
arbitrary strings describing the specific model.
functionName can only be
classification or regression.
targetReferenceCategory can be used for specifying the reference
category of the target variable in a multinomial classification model.
Normally the reference category is the one from DataDictionary that does
not appear in the ParamMatrix, but when several models are combined in one
PMML file an explicit specification is needed.
cumulativeLink: specifies the type of cumulative link function to
use when ordinalMultinomial model type is specified.
isScorable: This attribute indicates if the model is valid for
scoring. If this attribute is true or if it is missing, then the model
should be processed normally. However, if the attribute is false, then the
model producer has indicated that this model is intended for information
purposes only and should not be used to generate results. In order to be
valid PMML, all required elements and attributes must be present, even for
non-scoring models. For more details, see General Structure.
cumulativeLink: specifies the type of cumulative link function to
use when ordinalMultinomial model type is specified.
CUMULATIVE-LINK-FUNCTION data type
The following definition is used for specifying a cumulative link function used in
ordinalMultinomial model.
<!-- CUMULATIVE-LINK-FUNCTION: cumulative link functions available to
     ordinalMultinomial models. -->
<xs:simpleType name="CUMULATIVE-LINK-FUNCTION">
  <xs:restriction base="xs:string">
    <xs:enumeration value="logit"/>
    <xs:enumeration value="probit"/>
    <xs:enumeration value="cloglog"/>
    <xs:enumeration value="loglog"/>
    <xs:enumeration value="cauchit"/>
  </xs:restriction>
</xs:simpleType>
Specific formulas are listed below in the scoring example.
linkFunction: specifies the type of link function to use when
generalizedLinear model type is specified.
LINK-FUNCTION data type
The following definition is used for specifying a link function used in generalizedLinear model.
<!-- LINK-FUNCTION: link functions available to generalizedLinear models.
     The oddspower and power links additionally need linkParameter. -->
<xs:simpleType name="LINK-FUNCTION">
  <xs:restriction base="xs:string">
    <xs:enumeration value="cloglog"/>
    <xs:enumeration value="identity"/>
    <xs:enumeration value="log"/>
    <xs:enumeration value="logc"/>
    <xs:enumeration value="logit"/>
    <xs:enumeration value="loglog"/>
    <xs:enumeration value="negbin"/>
    <xs:enumeration value="oddspower"/>
    <xs:enumeration value="power"/>
    <xs:enumeration value="probit"/>
  </xs:restriction>
</xs:simpleType>
Specific formulas are listed below in the scoring example.
linkParameter: specifies the additional numeric parameter required by the
following link functions: oddspower and power.
trialsVariable: specifies an additional variable used during
scoring some generalizedLinear models (see the description of scoring
procedure below). This attribute must refer to a DataField or a
DerivedField.
trialsValue: a positive integer used during scoring some
generalizedLinear models (see the description of scoring procedure below).
At most one of the attributes trialsVariable and trialsValue can be present
in a model.
distribution: the probability distribution of the dependent
variable for generalizedLinear model may be specified as normal, binomial,
gamma, inverse Gaussian, negative binomial, Poisson, or Tweedie.
distParameter: specifies an ancillary parameter value for the
negative binomial distribution.
offsetVariable: if present, this variable is used during scoring
generalizedLinear, ordinalMultinomial, or multinomialLogistic models (see
the description of scoring procedures below). This attribute must refer to
a DataField or a DerivedField.
offsetValue: if present, this value is used during scoring
generalizedLinear, ordinalMultinomial, or multinomialLogistic models. It
works like a user-specified intercept (see the description of the scoring
procedures below). At most one of the attributes offsetVariable and
offsetValue can be present in a model.
modelDF: the value of degrees of freedom for the model. This
value is needed for computing confidence intervals for predicted
values.
endTimeVariable: if modelType is CoxRegression, this variable is
required during scoring (see the description of scoring procedures below).
This attribute must refer to a DataField or a DerivedField containing a
continuous variable.
startTimeVariable: if modelType is CoxRegression, this variable
is optional, it is not used during scoring but is an important piece of
information about model building. This attribute must refer to a DataField
or a DerivedField containing a continuous variable.
subjectIDVariable: if modelType is CoxRegression, this variable
is optional, it is not used during scoring but is an important piece of
information about model building. This attribute must refer to a DataField
or a DerivedField. Explicitly listing all categories of this variable is
not recommended.
statusVariable: if modelType is CoxRegression, this variable is
required during scoring (see the description of scoring procedures below).
This attribute must refer to a DataField or a DerivedField.
baselineStrataVariable: if modelType is CoxRegression, this
variable is optional, if present it is used during scoring (see the
description of scoring procedures below). This attribute must refer to a
DataField or a DerivedField containing a categorical variable.
PPMatrix: Predictor-to-Parameter correlation
matrix. It is a rectangular matrix having a column for each Predictor
(factor or covariate) and a row for each Parameter. The matrix is
represented as a sequence of cells, each cell containing a number
representing the correlation between the Predictor and the Parameter. The
cell values are computed as follows:
- For each Predictor variable v and each Parameter p, the corresponding
cell value is missing (empty) if there is no correlation between v and
p.
- If there is a correlation between a covariate Predictor and the
Parameter, the cell value is set to the exponent that the covariate is
raised to in the dependency expression. Example: assuming variable jobcat
is a factor and work is a covariate, the Parameter
[jobcat=professional] * work * work is correlated to the
covariate work, and the number that should be entered in the cell is 2
because work is present at second power in the expression.
- If there is a correlation between a factor variable and the
Parameter, the cell value is set to the Predictor value that determines
the correlation. Example: Assuming the categories of the factor variable
jobcat are professional, clerical, skilled,
unskilled, the cell in the matrix that corresponds to
(jobcat, jobcat=skilled) has a value of skilled.
The empty cells are not required to be present in the exported model
file. All cells determined to be missing from the xml file at model parsing
will be assumed to be empty. Since empty cells make up a large chunk of the
matrix, this will reduce the size of the exported model. Note that PPMatrix
can be empty if a model is intercept-only or Cox regression without
parameters.
Note the implied targetCategory attribute. This is permitted in order to
allow usage of different PPMatrices for different response values in
classification models. For multinomialLogistic model if any PPCell contains
this attribute, the expectation is that for that particular response level,
a full PPMatrix can be reconstructed from the PMML document. It is that
matrix which will be used during scoring in order to get the probability
(and other statistics) for the response level. By default, all target
categories share the PPMatrix.
targetCategory attribute can thus be used to override the default
for some or all target categories.
PPCell: cell in the PPMatrix. Knows its row
name, column name, and information as described above.
PCovMatrix: matrix of Parameter estimate covariances. Made up of
PCovCells, each of them being located via row information for Parameter
name (pRow), row information for target variable value (tRow), column
information for Parameter name (pCol) and column information for target
variable value (tCol). Note that the matrix is symmetric with respect to
the main diagonal (interchanging tRow and tCol together with pRow and pCol
will not change the value). Therefore it is sufficient that only half of
the matrix be exported. Attributes tRow and tCol are optional since they
are not needed for linear regression models. This element has an optional
attribute type that can take values model and
robust. This attribute describes the way the covariance matrix was
computed in generalizedLinear model. The robust option is also
known as Huber-White or sandwich or HCCM.
ParamMatrix: Parameter matrix. A table containing the Parameter
values along with associated statistics (degrees of freedom). One dimension
has the target variable's categories, the other has the Parameter names.
The table is represented by specifying each cell. There is no requirement
for Parameter names other than that each name should uniquely identify one
Parameter.
PCell: cell in the ParamMatrix. The optional
targetCategory and required parameterName attributes
determine the cell's location in the Parameter matrix. The
information contained is: beta (actual Parameter value, required),
and df (degrees of freedom, optional). For
ordinalMultinomial model ParamMatrix specifies different
values for the intercept parameter: one for each target category
except one. Values for all other parameters are constant across all target
variable values.
EventValues contains a list of Value and/or
Interval elements that describe values of the status variable in
Cox Regression model corresponding to the "Event". Please see example of
Cox Regression below for explanation.
BaseCumHazardTables: Values of baseline cumulative hazard for Cox
regression. In the presence of baseline strata variable there is a separate
table for each baseline stratum value, otherwise only one table is needed.
There is a value for maximum time for which data was available, and a set
of pairs of time and cumulative hazard values, in BaselineCell
elements.
BaselineCell: cell in the BaseCumHazardTables. The
required time and cumHazard attributes contain all needed
information.
BaselineStratum contains a set of BaselineCells plus the
maximum time for one value of baseline strata variable. The optional
label attribute makes it more human-readable.
General Regression Samples: Multinomial Logistic Example
Here is the information about the variables:
Name |
Type |
Number of categories |
Categories (numeric coding in parentheses) |
JOBCAT |
Target |
7 |
Clerical(1), Office trainee(2), Security officer(3), College
trainee(4), Exempt employee(5), MBA trainee(6), and Technical(7) |
SEX |
Factor |
2 |
Male(0), and Female(1) |
MINORITY |
Factor |
2 |
Non-Minority(0), and Minority(1) |
AGE |
Covariate |
|
|
WORK |
Covariate |
|
|
The Parameter estimates are displayed as follows:
Parameter Estimates |
Employment Categorya |
B |
df |
Clerical |
Intercept |
26.836 |
1 |
[sex=0] |
-.719 |
1 |
[sex=1] |
0b |
0 |
[sex=0] * [minority=0] |
-19.214 |
1 |
[sex=0] * [minority=1] |
0b |
0 |
[sex=1] * [minority=0] |
-.114 |
1 |
[sex=1] * [minority=1] |
0b |
0 |
age |
-.133 |
1 |
work |
7.885E-02 |
1 |
Office trainee |
Intercept |
31.077 |
1 |
[sex=0] |
-0.869 |
1 |
[sex=1] |
0b |
0 |
[sex=0] * [minority=0] |
-18.990 |
1 |
[sex=0] * [minority=1] |
0b |
0 |
[sex=1] * [minority=0] |
1.010 |
1 |
[sex=1] * [minority=1] |
0b |
0 |
age |
-.300 |
1 |
work |
.152 |
1 |
Security officer |
Intercept |
6.836 |
1 |
[sex=0] |
16.305 |
1 |
[sex=1] |
0b |
0 |
[sex=0] * [minority=0] |
-20.041 |
1 |
[sex=0] * [minority=1] |
0b |
0 |
[sex=1] * [minority=0] |
-.730 |
1 |
[sex=1] * [minority=1] |
0b |
0 |
age |
-.156 |
1 |
work |
.267 |
1 |
College trainee |
Intercept |
8.816 |
1 |
[sex=0] |
15.264 |
1 |
[sex=1] |
0b |
0 |
[sex=0] * [minority=0] |
-16.799 |
1 |
[sex=0] * [minority=1] |
0b |
0 |
[sex=1] * [minority=0] |
16.480 |
1 |
[sex=1] * [minority=1] |
0b |
0 |
age |
-.133 |
1 |
work |
-.160 |
1 |
Exempt employee |
Intercept |
5.862 |
1 |
[sex=0] |
16.437 |
1 |
[sex=1] |
0b |
0 |
[sex=0] * [minority=0] |
-17.309 |
1 |
[sex=0] * [minority=1] |
0b |
0 |
[sex=1] * [minority=0] |
15.888 |
1 |
[sex=1] * [minority=1] |
0b |
0 |
age |
-.105 |
1 |
work |
6.914E-02 |
1 |
MBA trainee |
Intercept |
6.495 |
1 |
[sex=0] |
17.297 |
1 |
[sex=1] |
0b |
0 |
[sex=0] * [minority=0] |
-19.098 |
1 |
[sex=0] * [minority=1] |
0b |
0 |
[sex=1] * [minority=0] |
16.841 |
1 |
[sex=1] * [minority=1] |
0b |
0 |
age |
-.141 |
1 |
work |
-5.058E-02 |
1 |
a. The reference category is:
Technical. |
b. This parameter is set to zero because it
is redundant. |
The PPMatrix is:
Parameter |
SEX |
MINORITY |
AGE |
WORK |
Intercept |
|
|
|
|
[SEX = 0] |
0 |
|
|
|
[SEX = 1] |
1 |
|
|
|
[MINORITY = 0]([SEX = 0]) |
0 |
0 |
|
|
[MINORITY = 1]([SEX = 0]) |
0 |
1 |
|
|
[MINORITY = 0]([SEX = 1]) |
1 |
0 |
|
|
[MINORITY = 1]([SEX = 1]) |
1 |
1 |
|
|
AGE |
|
|
1 |
|
WORK |
|
|
|
1 |
This mapping of Predictors to Parameter combinations is the same for each
target variable category. The corresponding XML model is:
<!-- Multinomial logistic example: predicts jobcat (7 categories) from the
     factors sex and minority and the covariates age and work.
     The namespace name must be the literal PMML 4.2 URI (http, not https):
     namespace names are compared as strings, so any other spelling puts the
     document in a different namespace. -->
<PMML xmlns="http://www.dmg.org/PMML-4_2" version="4.2">
  <Header copyright="dmg.org"/>
  <DataDictionary numberOfFields="5">
    <DataField name="jobcat" optype="categorical" dataType="double">
      <Value value="1" displayValue="Clerical"/>
      <Value value="2" displayValue="Office trainee"/>
      <Value value="3" displayValue="Security officer"/>
      <Value value="4" displayValue="College trainee"/>
      <Value value="5" displayValue="Exempt employee"/>
      <Value value="6" displayValue="MBA trainee"/>
      <Value value="7" displayValue="Technical"/>
    </DataField>
    <DataField name="minority" optype="categorical" dataType="double">
      <Value value="0" displayValue="Non-Minority"/>
      <Value value="1" displayValue="Minority"/>
    </DataField>
    <DataField name="sex" optype="categorical" dataType="double">
      <Value value="0" displayValue="Male"/>
      <Value value="1" displayValue="Female"/>
    </DataField>
    <DataField name="age" optype="continuous" dataType="double"/>
    <DataField name="work" optype="continuous" dataType="double"/>
  </DataDictionary>
  <!-- targetReferenceCategory="7" names Technical as the reference
       category; it therefore has no PCell entries in the ParamMatrix. -->
  <GeneralRegressionModel modelType="multinomialLogistic" functionName="classification" targetReferenceCategory="7">
    <MiningSchema>
      <MiningField name="jobcat" usageType="target"/>
      <MiningField name="minority" usageType="active"/>
      <MiningField name="sex" usageType="active"/>
      <MiningField name="age" usageType="active"/>
      <MiningField name="work" usageType="active"/>
    </MiningSchema>
    <ParameterList>
      <Parameter name="p0" label="Intercept"/>
      <Parameter name="p1" label="[SEX=0]"/>
      <Parameter name="p2" label="[SEX=1]"/>
      <Parameter name="p3" label="[MINORITY=0]([SEX=0])"/>
      <Parameter name="p4" label="[MINORITY=1]([SEX=0])"/>
      <Parameter name="p5" label="[MINORITY=0]([SEX=1])"/>
      <Parameter name="p6" label="[MINORITY=1]([SEX=1])"/>
      <Parameter name="p7" label="age"/>
      <Parameter name="p8" label="work"/>
    </ParameterList>
    <FactorList>
      <Predictor name="sex"/>
      <Predictor name="minority"/>
    </FactorList>
    <CovariateList>
      <Predictor name="age"/>
      <Predictor name="work"/>
    </CovariateList>
    <!-- Only non-empty cells are listed. p0 has no cells at all, which is
         what marks it as the intercept. -->
    <PPMatrix>
      <PPCell value="0" predictorName="sex" parameterName="p1"/>
      <PPCell value="1" predictorName="sex" parameterName="p2"/>
      <PPCell value="0" predictorName="sex" parameterName="p3"/>
      <PPCell value="0" predictorName="sex" parameterName="p4"/>
      <PPCell value="1" predictorName="sex" parameterName="p5"/>
      <PPCell value="1" predictorName="sex" parameterName="p6"/>
      <PPCell value="0" predictorName="minority" parameterName="p3"/>
      <PPCell value="1" predictorName="minority" parameterName="p4"/>
      <PPCell value="0" predictorName="minority" parameterName="p5"/>
      <PPCell value="1" predictorName="minority" parameterName="p6"/>
      <PPCell value="1" predictorName="age" parameterName="p7"/>
      <PPCell value="1" predictorName="work" parameterName="p8"/>
    </PPMatrix>
    <!-- p2, p4 and p6 are redundant parameters whose estimates are zero,
         so no PCell is written for them. -->
    <ParamMatrix>
      <!-- Target category 1: Clerical -->
      <PCell targetCategory="1" parameterName="p0" beta="26.836" df="1"/>
      <PCell targetCategory="1" parameterName="p1" beta="-.719" df="1"/>
      <PCell targetCategory="1" parameterName="p3" beta="-19.214" df="1"/>
      <PCell targetCategory="1" parameterName="p5" beta="-.114" df="1"/>
      <PCell targetCategory="1" parameterName="p7" beta="-.133" df="1"/>
      <PCell targetCategory="1" parameterName="p8" beta="7.885E-02" df="1"/>
      <!-- Target category 2: Office trainee -->
      <PCell targetCategory="2" parameterName="p0" beta="31.077" df="1"/>
      <PCell targetCategory="2" parameterName="p1" beta="-.869" df="1"/>
      <PCell targetCategory="2" parameterName="p3" beta="-18.99" df="1"/>
      <PCell targetCategory="2" parameterName="p5" beta="1.01" df="1"/>
      <PCell targetCategory="2" parameterName="p7" beta="-.3" df="1"/>
      <PCell targetCategory="2" parameterName="p8" beta=".152" df="1"/>
      <!-- Target category 3: Security officer -->
      <PCell targetCategory="3" parameterName="p0" beta="6.836" df="1"/>
      <PCell targetCategory="3" parameterName="p1" beta="16.305" df="1"/>
      <PCell targetCategory="3" parameterName="p3" beta="-20.041" df="1"/>
      <PCell targetCategory="3" parameterName="p5" beta="-.73" df="1"/>
      <PCell targetCategory="3" parameterName="p7" beta="-.156" df="1"/>
      <PCell targetCategory="3" parameterName="p8" beta=".267" df="1"/>
      <!-- Target category 4: College trainee -->
      <PCell targetCategory="4" parameterName="p0" beta="8.816" df="1"/>
      <PCell targetCategory="4" parameterName="p1" beta="15.264" df="1"/>
      <PCell targetCategory="4" parameterName="p3" beta="-16.799" df="1"/>
      <PCell targetCategory="4" parameterName="p5" beta="16.48" df="1"/>
      <PCell targetCategory="4" parameterName="p7" beta="-.133" df="1"/>
      <PCell targetCategory="4" parameterName="p8" beta="-.16" df="1"/>
      <!-- Target category 5: Exempt employee -->
      <PCell targetCategory="5" parameterName="p0" beta="5.862" df="1"/>
      <PCell targetCategory="5" parameterName="p1" beta="16.437" df="1"/>
      <PCell targetCategory="5" parameterName="p3" beta="-17.309" df="1"/>
      <PCell targetCategory="5" parameterName="p5" beta="15.888" df="1"/>
      <PCell targetCategory="5" parameterName="p7" beta="-.105" df="1"/>
      <PCell targetCategory="5" parameterName="p8" beta="6.914E-02" df="1"/>
      <!-- Target category 6: MBA trainee -->
      <PCell targetCategory="6" parameterName="p0" beta="6.495" df="1"/>
      <PCell targetCategory="6" parameterName="p1" beta="17.297" df="1"/>
      <PCell targetCategory="6" parameterName="p3" beta="-19.098" df="1"/>
      <PCell targetCategory="6" parameterName="p5" beta="16.841" df="1"/>
      <PCell targetCategory="6" parameterName="p7" beta="-.141" df="1"/>
      <PCell targetCategory="6" parameterName="p8" beta="-5.058E-02" df="1"/>
    </ParamMatrix>
  </GeneralRegressionModel>
</PMML>
Scoring Algorithm
We will use the above example to illustrate the steps that should be
followed in the scoring process. Say the following case (observation) must be
scored:
obs = (sex=1 minority=0 age=25 work=4)
- Do model file parsing. Reconstruct the PPMatrix and the Parameter
matrix.
- To score a case, construct the vector $x$ (of length equal to the
number of Parameters in the model) as follows.
- If row $i$ of the PP correlation matrix is empty, that means
the $i$-th parameter is an intercept, so set $x_i = 1$.
- If row $i$ of the PP correlation matrix is nonempty and corresponds to
a factor value or set of factor values, set $x_i$ to
1 if the case being scored matches this row, 0 if it does
not.
- If row $i$ of the PP correlation matrix is nonempty and
corresponds to a covariate $c$, with the entry $r$, then
$r$ is the multiplicity of the covariate $c$ in the
parameter, so set $x_i = c^r$ using the value of
$c$ in the record.
- If row $i$ of the PP correlation matrix is nonempty and
corresponds to a number of covariates, then set $x_i$ to
be the product of covariate values (from the record) using their
corresponding multiplicities found in the PP matrix row.
- Finally, if row $i$ of the PP correlation matrix is nonempty
and corresponds to a combination of factors and covariates, then set
$x_i$ to be the product of covariate values (from the
record) using their corresponding multiplicities found in the PP matrix
if the factor values in the record match those in the PP matrix row and
0 otherwise.
- Now for each response category (value of the target variable) $j$,
let $\beta_j$ be the vector of Parameter estimates for that
response category. (If $k$ is the last response category, remember
that by convention $\beta_k = 0$.) Set
$r_j = \langle x, \beta_j \rangle$ and $s_j = \exp(r_j)$.
The probability that our case falls into category $j$ is then
$p_j = s_j / (s_1 + \dots + s_k)$.
- If you just want to assign each case to the category into which it has
the highest probability of falling, it is not necessary to compute anything
after $r_j$; the category whose $r_j$ value
is highest is the one you want. If you want to compute the actual
probabilities (for instance, in order to know whether you are assigning a
case to a 51% good or a 99% good category), we use a little dodge to avoid
overflow. Namely, $p_j$ is the reciprocal of
$\exp(r_1 - r_j) + \dots + \exp(r_k - r_j)$.
If $r_i - r_j > 700$ for any i, then the exponential will overflow; but in this
case $p_j$ is so small that we can set it to zero. Underflow
in the denominator can be ignored since the term
$\exp(r_j - r_j) = 1$ ensures the denominator is at least
1.
General Regression Samples: General Linear Example
The information about the variables is the same as in the previous
example, but now the target variable JOBCAT is considered to be
continuous.
The Predictor-to-Parameter combinations mapping is the same as above. The
corresponding XML model is:
<!-- General linear example: the continuous target jobcat is modelled from two
     factors (sex, minority) and two covariates (age, work).
     The default namespace uses the http scheme so that it is the same literal
     string as the target namespace of the PMML 4.2 schema; namespace names are
     compared character by character, so an https variant names a different
     namespace. -->
<PMML xmlns="http://www.dmg.org/PMML-4_2" version="4.2">
  <Header copyright="dmg.org"/>
  <DataDictionary numberOfFields="5">
    <DataField name="jobcat" optype="continuous" dataType="double"/>
    <DataField name="minority" optype="categorical" dataType="double"/>
    <DataField name="sex" optype="categorical" dataType="double"/>
    <DataField name="age" optype="continuous" dataType="double"/>
    <DataField name="work" optype="continuous" dataType="double"/>
  </DataDictionary>
  <GeneralRegressionModel modelType="generalLinear" functionName="regression">
    <MiningSchema>
      <MiningField name="jobcat" usageType="target"/>
      <MiningField name="minority" usageType="active"/>
      <MiningField name="sex" usageType="active"/>
      <MiningField name="age" usageType="active"/>
      <MiningField name="work" usageType="active"/>
    </MiningSchema>
    <!-- Nine parameters: intercept, sex main effect, minority nested within
         sex, and the two covariates. -->
    <ParameterList>
      <Parameter name="p0" label="Intercept"/>
      <Parameter name="p1" label="[SEX=0]"/>
      <Parameter name="p2" label="[SEX=1]"/>
      <Parameter name="p3" label="[MINORITY=0]([SEX=0])"/>
      <Parameter name="p4" label="[MINORITY=1]([SEX=0])"/>
      <Parameter name="p5" label="[MINORITY=0]([SEX=1])"/>
      <Parameter name="p6" label="[MINORITY=1]([SEX=1])"/>
      <Parameter name="p7" label="age"/>
      <Parameter name="p8" label="work"/>
    </ParameterList>
    <FactorList>
      <Predictor name="sex"/>
      <Predictor name="minority"/>
    </FactorList>
    <CovariateList>
      <Predictor name="age"/>
      <Predictor name="work"/>
    </CovariateList>
    <!-- Predictor-to-parameter mapping. p0 has no PPCell (empty row), which
         the scoring algorithm treats as the intercept. For factors, value is
         the category to match; for covariates, value is the exponent. -->
    <PPMatrix>
      <PPCell value="0" predictorName="sex" parameterName="p1"/>
      <PPCell value="1" predictorName="sex" parameterName="p2"/>
      <PPCell value="0" predictorName="sex" parameterName="p3"/>
      <PPCell value="0" predictorName="sex" parameterName="p4"/>
      <PPCell value="1" predictorName="sex" parameterName="p5"/>
      <PPCell value="1" predictorName="sex" parameterName="p6"/>
      <PPCell value="0" predictorName="minority" parameterName="p3"/>
      <PPCell value="1" predictorName="minority" parameterName="p4"/>
      <PPCell value="0" predictorName="minority" parameterName="p5"/>
      <PPCell value="1" predictorName="minority" parameterName="p6"/>
      <PPCell value="1" predictorName="age" parameterName="p7"/>
      <PPCell value="1" predictorName="work" parameterName="p8"/>
    </PPMatrix>
    <!-- Parameter estimates. No PCell is given for p2, p4 and p6. There is no
         targetCategory attribute because the target is continuous. -->
    <ParamMatrix>
      <PCell parameterName="p0" beta="1.602" df="1"/>
      <PCell parameterName="p1" beta="0.580" df="1"/>
      <PCell parameterName="p3" beta="0.831" df="1"/>
      <PCell parameterName="p5" beta="0.429" df="1"/>
      <PCell parameterName="p7" beta="-0.012" df="1"/>
      <PCell parameterName="p8" beta="0.010" df="1"/>
    </ParamMatrix>
  </GeneralRegressionModel>
</PMML>
Scoring Algorithm
For this example the steps that should be followed in the scoring process
are similar to the previous one but fewer. Say the following case
(observation) must be scored:
obs = (sex=1 minority=0 age=25 work=4)
- Do model file parsing. Reconstruct the PPMatrix and the Parameter
matrix.
- To score a case, construct the vector x (of length equal to the number
of Parameters in the model) as follows.
- If row i of the PP correlation matrix is empty, set x
i = 1.
- If row i of the PP correlation matrix is nonempty and corresponds to
a factor value or set of factor values, set $x_i$ to
1 if the case being scored matches this row, 0 if it does
not.
- If row i of the PP correlation matrix is nonempty and
corresponds to a covariate c, with the entry r, then
r is the multiplicity (exponent) of the covariate c in the
parameter, so set $x_i = c^r$ using the value of
c in the record.
- If row i of the PP correlation matrix is nonempty and
corresponds to a number of covariates, then set xi to
be the product of covariate values (from the record) using their
corresponding multiplicities found in the PP matrix row.
- Finally, if row i of the PP correlation matrix is nonempty
and corresponds to a combination of factors and covariates, then set
xi to be the product of covariate values (from the
record) using their corresponding multiplicities found in the PP matrix
if the factor values in the record match those in the PP matrix row and
0 otherwise.
- Now let β be the vector of Parameter estimates. The inner
product r = <x,β> is the predicted value for the considered
case.
General Regression Samples: Ordinal Multinomial Example
The information about the variables is the same as in the previous
examples, but now the target variable JOBCAT is considered to be ordinal.
The order is very important for ordinal fields. Therefore, a list of all
valid values must be present in element DataField anytime attribute
optype is set to ordinal. In this way, the sequence of values is determined by
the order in which they appear in element DataField, from top to bottom.
The Predictor-to-Parameter combinations mapping is the same as above. The
corresponding XML model is:
<!-- Ordinal multinomial example: jobcat is an ordinal target with seven
     ordered values, scored through a cumulative logit link.
     The default namespace uses the http scheme so that it is the same literal
     string as the target namespace of the PMML 4.2 schema; namespace names are
     compared character by character, so an https variant names a different
     namespace. -->
<PMML xmlns="http://www.dmg.org/PMML-4_2" version="4.2">
  <Header copyright="dmg.org"/>
  <DataDictionary numberOfFields="5">
    <!-- For an ordinal field the order of the Value elements defines the
         order of the categories, from top to bottom. -->
    <DataField name="jobcat" optype="ordinal" dataType="integer">
      <Value value="1" displayValue="Clerical"/>
      <Value value="2" displayValue="Office trainee"/>
      <Value value="3" displayValue="Security officer"/>
      <Value value="4" displayValue="College trainee"/>
      <Value value="5" displayValue="Exempt employee"/>
      <Value value="6" displayValue="MBA trainee"/>
      <Value value="7" displayValue="Technical"/>
    </DataField>
    <DataField name="minority" optype="categorical" dataType="double">
      <Value value="0" displayValue="Non-Minority"/>
      <Value value="1" displayValue="Minority"/>
    </DataField>
    <DataField name="sex" optype="categorical" dataType="double">
      <Value value="0" displayValue="Male"/>
      <Value value="1" displayValue="Female"/>
    </DataField>
    <DataField name="age" optype="continuous" dataType="double"/>
    <DataField name="work" optype="continuous" dataType="double"/>
  </DataDictionary>
  <GeneralRegressionModel modelType="ordinalMultinomial"
                          functionName="classification"
                          cumulativeLink="logit">
    <MiningSchema>
      <MiningField name="jobcat" usageType="target"/>
      <MiningField name="minority" usageType="active"/>
      <MiningField name="sex" usageType="active"/>
      <MiningField name="age" usageType="active"/>
      <MiningField name="work" usageType="active"/>
    </MiningSchema>
    <ParameterList>
      <Parameter name="p0" label="Intercept"/>
      <Parameter name="p1" label="[SEX=0]"/>
      <Parameter name="p2" label="[SEX=1]"/>
      <Parameter name="p3" label="[MINORITY=0]([SEX=0])"/>
      <Parameter name="p4" label="[MINORITY=1]([SEX=0])"/>
      <Parameter name="p5" label="[MINORITY=0]([SEX=1])"/>
      <Parameter name="p6" label="[MINORITY=1]([SEX=1])"/>
      <Parameter name="p7" label="age"/>
      <Parameter name="p8" label="work"/>
    </ParameterList>
    <FactorList>
      <Predictor name="sex"/>
      <Predictor name="minority"/>
    </FactorList>
    <CovariateList>
      <Predictor name="age"/>
      <Predictor name="work"/>
    </CovariateList>
    <!-- Predictor-to-parameter mapping. p0 has no PPCell (empty row), which
         the scoring algorithm treats as the intercept. -->
    <PPMatrix>
      <PPCell value="0" predictorName="sex" parameterName="p1"/>
      <PPCell value="1" predictorName="sex" parameterName="p2"/>
      <PPCell value="0" predictorName="sex" parameterName="p3"/>
      <PPCell value="0" predictorName="sex" parameterName="p4"/>
      <PPCell value="1" predictorName="sex" parameterName="p5"/>
      <PPCell value="1" predictorName="sex" parameterName="p6"/>
      <PPCell value="0" predictorName="minority" parameterName="p3"/>
      <PPCell value="1" predictorName="minority" parameterName="p4"/>
      <PPCell value="0" predictorName="minority" parameterName="p5"/>
      <PPCell value="1" predictorName="minority" parameterName="p6"/>
      <PPCell value="1" predictorName="age" parameterName="p7"/>
      <PPCell value="1" predictorName="work" parameterName="p8"/>
    </PPMatrix>
    <ParamMatrix>
      <!-- One intercept per target category except the last (7). -->
      <PCell targetCategory="1" parameterName="p0" beta="-0.683" df="1"/>
      <PCell targetCategory="2" parameterName="p0" beta="0.723" df="1"/>
      <PCell targetCategory="3" parameterName="p0" beta="1.104" df="1"/>
      <PCell targetCategory="4" parameterName="p0" beta="1.922" df="1"/>
      <PCell targetCategory="5" parameterName="p0" beta="3.386" df="1"/>
      <PCell targetCategory="6" parameterName="p0" beta="4.006" df="1"/>
      <!-- The remaining estimates carry no targetCategory attribute. -->
      <PCell parameterName="p1" beta="1.096" df="1"/>
      <PCell parameterName="p3" beta="0.957" df="1"/>
      <PCell parameterName="p5" beta="1.149" df="1"/>
      <PCell parameterName="p7" beta="-0.067" df="1"/>
      <PCell parameterName="p8" beta="0.060" df="1"/>
    </ParamMatrix>
  </GeneralRegressionModel>
</PMML>
Scoring Algorithm
For this example the steps that should be followed in the scoring process
are somewhat similar to the first example but also the link function is used.
Say the following case (observation) must be scored:
obs = (sex=1 minority=0 age=25 work=4)
- Do model file parsing. Reconstruct the PPMatrix and the Parameter
matrix.
- To score a case, construct the vector x (of length equal to the number
of Parameters in the model) as follows.
- If row i of the PP correlation matrix is empty, set x
i= 1.
- If row i of the PP correlation matrix is nonempty and corresponds to
a factor value or set of factor values, set $x_i$ to
1 if the case being scored matches this row, 0 if it does
not.
- If row i of the PP correlation matrix is nonempty and
corresponds to a covariate c, with the entry r, then
r is the multiplicity (exponent) of the covariate c in the
parameter, so set $x_i = c^r$ using the value of
c in the record.
- If row i of the PP correlation matrix is nonempty and
corresponds to a number of covariates, then set xi to
be the product of covariate values (from the record) using their
corresponding multiplicities found in the PP matrix row.
- Finally, if row i of the PP correlation matrix is nonempty
and corresponds to a combination of factors and covariates, then set
xi to be the product of covariate values (from the
record) using their corresponding multiplicities found in the PP matrix
if the factor values in the record match those in the PP matrix row and
0 otherwise.
- Obtain the values of the offsetVariable or
offsetValue a.
Set
- a = value from the observation information if
offsetVariable is used
- a = offsetValue from the XML file if offsetValue is
used
- a = 0 otherwise.
- When the target variable has only two categories, the inverse of link
function transforms the value predicted by the regression equation into the
corresponding probability of the first target category. If target variable
is ordinal with more than two categories, a different intercept parameter
value is specified by the model for each target category except the last.
Inverse of the link function transforms value predicted by the regression
equation with specified intercept value into the corresponding cumulative
probability for the given category.
How to compute $p_j$ := probability of
target = Value_j
For each response category (value of the target variable) j, let
$\beta_j$ be the vector of Parameter estimates for that response
category. (If k is the last response category, $\beta_k$ is
not specified.) For the given case let $\langle x, \beta_j \rangle$ be the
result of evaluating the inner product just like in the multinomialLogistic
model and $y_j = \langle x, \beta_j \rangle + a$. Predicted
probability for each category is then computed according to the following
formulas:
$p_1 = F(y_1)$
$p_j = F(y_j) - F(y_{j-1})$, for $2 \le j < k$
$p_k = 1 - F(y_{k-1})$
Function F is an inverse of the specified link function:
- logit, ordinal
- inverse of logit function: F(y)= 1/(1+exp(-y)).
- probit, ordinal
- inverse of probit function: F(y)= integral(from -∞ to
y)(1/sqrt(2*π))exp(-0.5*u*u)du.
- cloglog, ordinal
- inverse of cloglog function: F(y)= 1 - exp( -exp(y) ).
- loglog, ordinal
- inverse of loglog function: F(y)= exp( -exp(-y) ).
- cauchit, ordinal
- inverse of cauchit function: F(y)= 0.5 + (1/π)
arctan(y).
General Regression Samples: Simple Regression Example
Only two continuous predictors are used in this example, and the target
variable JOBCAT is considered to be continuous.
The Predictor-to-Parameter combinations mapping is trivial. The
corresponding XML model is:
<!-- Simple regression example: the continuous target jobcat is predicted from
     the two covariates age and work only.
     The default namespace uses the http scheme so that it is the same literal
     string as the target namespace of the PMML 4.2 schema; namespace names are
     compared character by character, so an https variant names a different
     namespace. -->
<PMML xmlns="http://www.dmg.org/PMML-4_2" version="4.2">
  <Header copyright="dmg.org"/>
  <!-- minority and sex are declared here but are not referenced by the
       MiningSchema of this model. -->
  <DataDictionary numberOfFields="5">
    <DataField name="jobcat" optype="continuous" dataType="double"/>
    <DataField name="minority" optype="continuous" dataType="double"/>
    <DataField name="sex" optype="continuous" dataType="double"/>
    <DataField name="age" optype="continuous" dataType="double"/>
    <DataField name="work" optype="continuous" dataType="double"/>
  </DataDictionary>
  <GeneralRegressionModel modelType="regression" functionName="regression">
    <MiningSchema>
      <MiningField name="jobcat" usageType="target"/>
      <MiningField name="age" usageType="active"/>
      <MiningField name="work" usageType="active"/>
    </MiningSchema>
    <ParameterList>
      <Parameter name="p0" label="Intercept"/>
      <Parameter name="p1" label="age"/>
      <Parameter name="p2" label="work"/>
    </ParameterList>
    <!-- No FactorList: the model has covariates only. -->
    <CovariateList>
      <Predictor name="age"/>
      <Predictor name="work"/>
    </CovariateList>
    <!-- Each covariate maps to exactly one parameter with exponent 1; p0 has
         no PPCell and is the intercept. -->
    <PPMatrix>
      <PPCell value="1" predictorName="age" parameterName="p1"/>
      <PPCell value="1" predictorName="work" parameterName="p2"/>
    </PPMatrix>
    <ParamMatrix>
      <PCell parameterName="p0" beta="2.922" df="1"/>
      <PCell parameterName="p1" beta="-0.031" df="1"/>
      <PCell parameterName="p2" beta="0.034" df="1"/>
    </ParamMatrix>
  </GeneralRegressionModel>
</PMML>
Scoring Algorithm
For this example the steps that should be followed in the scoring process
are somewhat similar to the general linear example but are even simpler. Say
the following case (observation) must be scored:
obs = (age=25 work=4)
- Do model file parsing. Reconstruct the PPMatrix and the
Parameter matrix.
- To score a case, construct the vector x (of length equal to the
number of Parameters in the model) as follows.
- If row i of the PP correlation matrix is empty, set x
i = 1.
- If row i of the PP correlation matrix is nonempty and
corresponds to a covariate c, the row should contain exactly one
nonzero entry, in the column corresponding to the independent variable
c. The value of this entry should be 1 since it is a
linear model, so set $x_i = c$ (using the value of
c which appears in this case).
- Now let β be the vector of Parameter estimates. The inner
product r = <x,β> is the predicted value for the considered
case.
General Regression Samples: Generalized Linear Model Example
The information about the variables is the same as in the previous
examples, but now the target variable JOBCAT is considered to be
continuous.
The Predictor-to-Parameter combinations mapping is the same as above. The
corresponding XML model is:
<!-- Generalized linear model example: the continuous target jobcat is modelled
     with a gamma distribution, a power link with parameter -1 and a constant
     offset of 3.
     The default namespace uses the http scheme so that it is the same literal
     string as the target namespace of the PMML 4.2 schema; namespace names are
     compared character by character, so an https variant names a different
     namespace. -->
<PMML xmlns="http://www.dmg.org/PMML-4_2" version="4.2">
  <Header copyright="dmg.org"/>
  <DataDictionary numberOfFields="5">
    <DataField name="jobcat" optype="continuous" dataType="double"/>
    <DataField name="minority" optype="categorical" dataType="double"/>
    <DataField name="sex" optype="categorical" dataType="double"/>
    <DataField name="age" optype="continuous" dataType="double"/>
    <DataField name="work" optype="continuous" dataType="double"/>
  </DataDictionary>
  <GeneralRegressionModel modelType="generalizedLinear"
                          modelName="GZLM"
                          functionName="regression"
                          distribution="gamma"
                          linkFunction="power"
                          linkParameter="-1"
                          offsetValue="3">
    <MiningSchema>
      <MiningField name="jobcat" usageType="target"/>
      <MiningField name="minority" usageType="active"/>
      <MiningField name="sex" usageType="active"/>
      <MiningField name="age" usageType="active"/>
      <MiningField name="work" usageType="active"/>
    </MiningSchema>
    <ParameterList>
      <Parameter name="p0" label="Intercept"/>
      <Parameter name="p1" label="[SEX=0]"/>
      <Parameter name="p2" label="[SEX=1]"/>
      <Parameter name="p3" label="[MINORITY=0]([SEX=0])"/>
      <Parameter name="p4" label="[MINORITY=1]([SEX=0])"/>
      <Parameter name="p5" label="[MINORITY=0]([SEX=1])"/>
      <Parameter name="p6" label="[MINORITY=1]([SEX=1])"/>
      <Parameter name="p7" label="age"/>
      <Parameter name="p8" label="work"/>
    </ParameterList>
    <FactorList>
      <Predictor name="sex"/>
      <Predictor name="minority"/>
    </FactorList>
    <CovariateList>
      <Predictor name="age"/>
      <Predictor name="work"/>
    </CovariateList>
    <!-- Predictor-to-parameter mapping. p0 has no PPCell (empty row), which
         the scoring algorithm treats as the intercept. -->
    <PPMatrix>
      <PPCell value="0" predictorName="sex" parameterName="p1"/>
      <PPCell value="1" predictorName="sex" parameterName="p2"/>
      <PPCell value="0" predictorName="sex" parameterName="p3"/>
      <PPCell value="0" predictorName="sex" parameterName="p4"/>
      <PPCell value="1" predictorName="sex" parameterName="p5"/>
      <PPCell value="1" predictorName="sex" parameterName="p6"/>
      <PPCell value="0" predictorName="minority" parameterName="p3"/>
      <PPCell value="1" predictorName="minority" parameterName="p4"/>
      <PPCell value="0" predictorName="minority" parameterName="p5"/>
      <PPCell value="1" predictorName="minority" parameterName="p6"/>
      <PPCell value="1" predictorName="age" parameterName="p7"/>
      <PPCell value="1" predictorName="work" parameterName="p8"/>
    </PPMatrix>
    <!-- Parameter estimates on the link scale. No PCell is given for p2, p4
         and p6. -->
    <ParamMatrix>
      <PCell parameterName="p0" beta="-2.30824444845005" df="1"/>
      <PCell parameterName="p1" beta="-0.268177596945098" df="1"/>
      <PCell parameterName="p3" beta="-0.169104566719988" df="1"/>
      <PCell parameterName="p5" beta="-0.219215962160056" df="1"/>
      <PCell parameterName="p7" beta="0.00427629446211706" df="1"/>
      <PCell parameterName="p8" beta="-0.00397117497757107" df="1"/>
    </ParamMatrix>
  </GeneralRegressionModel>
</PMML>
Scoring Algorithm
For this example the steps that should be followed in the scoring process
are somewhat similar to the second example but also the link function is
used. Say the following case (observation) must be scored:
obs = (sex=1 minority=0 age=25 work=4)
- Do model file parsing. Reconstruct the PPMatrix and the Parameter
matrix.
- To score a case, construct the vector x (of length equal to the number
of Parameters in the model) as follows.
- If row i of the PP correlation matrix is empty, set x
i = 1.
- If row i of the PP correlation matrix is nonempty and corresponds to
a factor value or set of factor values, set $x_i$ to
1 if the case being scored matches this row, 0 if it does
not.
- If row i of the PP correlation matrix is nonempty and
corresponds to a covariate c, with the entry r, then
r is the multiplicity (exponent) of the covariate c in the
parameter, so set $x_i = c^r$ using the value of
c in the record.
- If row i of the PP correlation matrix is nonempty and
corresponds to a number of covariates, then set xi to
be the product of covariate values (from the record) using their
corresponding multiplicities found in the PP matrix row.
- Finally, if row i of the PP correlation matrix is nonempty
and corresponds to a combination of factors and covariates, then set
xi to be the product of covariate values (from the
record) using their corresponding multiplicities found in the PP matrix
if the factor values in the record match those in the PP matrix row and
0 otherwise.
- Obtain the values of the offsetVariable or
offsetValue (a), trialsVariable or
trialsValue (b), distParameter (c) and
linkParameter (d).
Set
- a = value from the observation information if
offsetVariable is used
- a = offsetValue from the XML file if offsetValue is
used
- a = 0 otherwise
Set
- b = value from the observation information if
trialsVariable is used
- b = trialsValue from the XML file if trialsValue is
used
- b = 1 otherwise
Set
- c = distParameter from the XML file if link = negbin
and distribution = negbin
Set
- d = linkParameter from the XML file if link =
oddspower or power.
- Let β be the vector of Parameter estimates and
<x,β> be the inner product of two vectors x and
β. The predicted value for the considered case is F(<x,β> +
a)*b,
where function F is an inverse of the specified link
function:
- cloglog
- inverse of cloglog function: F(y) = 1 - exp( -exp(y)
).
- identity
- inverse of identity function: F(y) = y.
- log
- inverse of log function: F(y) = exp(y).
- logc
- inverse of logc function: F(y) = 1 - exp(y).
- logit
- inverse of logit function: F(y)= 1/(1 + exp(-y)).
- loglog
- inverse of loglog function: F(y) = exp( -exp(-y) ).
- negbin(c)
- inverse of negbin(c) function: F(y) = 1/(c(exp(-y) -
1)).
- oddspower(d)
- inverse of oddspower(d) function:
- $F(y) = 1/(1 + (1 + d \cdot y)^{-1/d})$ if
d != 0;
- $F(y) = 1/(1 + \exp(-y))$ if d = 0.
- power(d)
- inverse of power(d) function:
- $F(y) = y^{1/d}$ if d != 0;
- $F(y) = \exp(y)$ if d = 0.
- probit
- inverse of probit function: F(y) = integral(from -∞ to
y)(1/sqrt(2*π))exp(-0.5*u*u)du.
General Regression Samples: Example of a model with contrast
matrices
The following example illustrates the use of contrast
matrices in a regression model. Here salCat is a target variable
with two categories, "Low" and "High". There are two factors in the model
and two covariates. The factor gender has two categories and
uses "Simple" contrast matrix, while jobcat has three categories
and "Helmert" contrast matrix. The model uses main effects and some
interaction effects as indicated in PPMatrix and in parameter
labels.
<!-- Logistic model with contrast matrices: the two-category target salCat
     ("Low", "High"; reference category "High") is modelled from the factors
     gender and jobcat and the covariates educ and salbegin. -->
<GeneralRegressionModel modelType="multinomialLogistic"
                        modelName="contrastLogistic"
                        functionName="classification"
                        targetReferenceCategory="High">
  <MiningSchema>
    <MiningField name="salCat" usageType="target"/>
    <MiningField name="gender" usageType="active" missingValueTreatment="asIs"/>
    <MiningField name="educ" usageType="active" missingValueTreatment="asIs"/>
    <MiningField name="jobcat" usageType="active" missingValueTreatment="asIs"/>
    <MiningField name="salbegin" usageType="active" missingValueTreatment="asIs"/>
  </MiningSchema>
  <!-- Eight parameters: constant, main effects and interaction terms. -->
  <ParameterList>
    <Parameter name="P0000001" label="Constant"/>
    <Parameter name="P0000002" label="gender(1)"/>
    <Parameter name="P0000003" label="educ"/>
    <Parameter name="P0000004" label="jobcat(1)"/>
    <Parameter name="P0000005" label="jobcat(2)"/>
    <Parameter name="P0000006" label="gender(1) by jobcat(1)"/>
    <Parameter name="P0000007" label="gender(1) by jobcat(2)"/>
    <Parameter name="P0000008" label="educ by gender(1) by salbegin"/>
  </ParameterList>
  <FactorList>
    <!-- Each factor lists its categories and a contrast matrix holding one
         Array (row) per category, in the same order as the categories. -->
    <Predictor name="gender" contrastMatrixType="Simple">
      <Categories>
        <Category value="f"/>
        <Category value="m"/>
      </Categories>
      <Matrix nbRows="2" nbCols="1">
        <Array type="real" n="1">.5</Array>
        <Array type="real" n="1">-.5</Array>
      </Matrix>
    </Predictor>
    <Predictor name="jobcat" contrastMatrixType="Helmert">
      <Categories>
        <Category value="1"/>
        <Category value="2"/>
        <Category value="3"/>
      </Categories>
      <Matrix nbRows="3" nbCols="2">
        <Array type="real" n="2">.666666666667 0</Array>
        <Array type="real" n="2">-.333333333333 .5</Array>
        <Array type="real" n="2">-.333333333333 -.5</Array>
      </Matrix>
    </Predictor>
  </FactorList>
  <CovariateList>
    <Predictor name="educ"/>
    <Predictor name="salbegin"/>
  </CovariateList>
  <!-- Predictor-to-parameter mapping. P0000001 has no PPCell and is the
       constant term. For a factor, value names the category that selects the
       contrast-matrix column; for a covariate, value is the exponent. -->
  <PPMatrix>
    <PPCell value="f" predictorName="gender" parameterName="P0000002"/>
    <PPCell value="1" predictorName="educ" parameterName="P0000003"/>
    <PPCell value="1" predictorName="jobcat" parameterName="P0000004"/>
    <PPCell value="2" predictorName="jobcat" parameterName="P0000005"/>
    <PPCell value="f" predictorName="gender" parameterName="P0000006"/>
    <PPCell value="1" predictorName="jobcat" parameterName="P0000006"/>
    <PPCell value="f" predictorName="gender" parameterName="P0000007"/>
    <PPCell value="2" predictorName="jobcat" parameterName="P0000007"/>
    <PPCell value="1" predictorName="educ" parameterName="P0000008"/>
    <PPCell value="f" predictorName="gender" parameterName="P0000008"/>
    <PPCell value="1" predictorName="salbegin" parameterName="P0000008"/>
  </PPMatrix>
  <!-- Estimates are given for target category "Low" only. -->
  <ParamMatrix>
    <PCell targetCategory="Low" parameterName="P0000001" beta="17.0599111512836" df="1"/>
    <PCell targetCategory="Low" parameterName="P0000002" beta="-2.79578119817189" df="1"/>
    <PCell targetCategory="Low" parameterName="P0000003" beta="-0.625739483585618" df="1"/>
    <PCell targetCategory="Low" parameterName="P0000004" beta="-5.76523337984277" df="1"/>
    <PCell targetCategory="Low" parameterName="P0000005" beta="17.743574615114" df="1"/>
    <PCell targetCategory="Low" parameterName="P0000006" beta="0.421913613872923" df="1"/>
    <PCell targetCategory="Low" parameterName="P0000007" beta="0" df="0"/>
    <PCell targetCategory="Low" parameterName="P0000008" beta="1.1136356754678E-005" df="1"/>
  </ParamMatrix>
</GeneralRegressionModel>
Scoring Algorithm
For this example the following steps are needed to score the case
obs = ( gender="f" educ=19 jobcat=3 salbegin=45000 )
Note that as indicated by the Categories element, the categories for
factor gender have the following indices: "f" is 1, "m" is 2. For
jobcat the categories are "1", "2", "3".
- Do model file parsing. Reconstruct the PPMatrix, the Parameter
matrix, contrast matrices Cgender and Cjobcat for
the factors.
- To score the above case, construct the vector x (of length equal to
the number of Parameters in the model, 8) as follows.
- If row i of the PP correlation matrix is empty, set x
i = 1. Here
x1=1.
- If row i of the PP correlation matrix corresponds to exactly one
factor value, set $x_i$ to the entry of the
contrast matrix for this factor with row index defined by the
factor category in the record and column index defined by the
category in the PP matrix. For our example we get:
$x_2 = C_{gender}(1,1) = 0.5$
$x_4 = C_{jobcat}(3,1) = -0.333333333333$
$x_5 = C_{jobcat}(3,2) = -0.5$
- If row of the PP correlation matrix corresponds to a set of
factor values, set xi to the product of the
entries of contrast matrices as described above. In our example we
get:
x6=Cgender(1,1)*Cjobcat(3,1)=0.5*(-0.333333333333)=-0.16666666666666
x7=Cgender(1,1)*Cjobcat(3,2)=0.5*(-0.5)=-0.25
- If row i of the PP correlation matrix corresponds to a
covariate c, with the entry r, then r is the
multiplicity (exponent) of the covariate c in the parameter, so set
$x_i = c^r$ using the value of c in
the record. In our example we get:
$x_3 = educ^1 = 19$
- If row i of the PP correlation matrix corresponds to a
number of covariates, then set xi to be the
product of covariate values (from the record) using their
corresponding multiplicities found in the PP matrix row.
- Finally, if row i of the PP correlation matrix
corresponds to a combination of factors and covariates, then set
xi to be the product of covariate values (from
the record) using their corresponding multiplicities found in the
PP matrix times the product of contrast matrix entries as described
above. In the example we get:
x8=educ*Cgender(1,1)*salbegin=19*0.5*45000=427500
- Now let β be the vector of Parameter estimates (for a
specified target category, if applicable). The inner product r =
<x,β> can be used as described before for various types of
regression models. In our example the probability of target category
"Low" will be computed as p("Low") = exp( r )/(1 + exp( r )
).
General Regression Samples: Cox Regression Model Example
Cox proportional hazards model of survival is often used in real-life
research studies in various industries including pharmaceutical and
telecommunications. The idea is as follows: the data must contain an
end time variable and a status variable, in addition to any
number of predictor variables, and optionally a baseline strata
variable, a start time variable, and a subject ID
variable. Usually the status variable has certain values or intervals
of values that are considered "an event", such as the death of a patient
or a telephone customer switching to a competing carrier. The event is
happening or not at the time indicated by the end time variable.
Survival is the probability of the event not happening.
Cumulative hazard is defined as the negative log of survival. The
main assumption is that cumulative hazard for a case with predictors
x at time t is computed as
H( t | x ) = H0( t ) exp( x' * β ),
where H0(t) is the baseline cumulative hazard
at time t, vector β has regression parameter estimates. The
probability of survival is
S( t | x ) = exp( -H( t | x ) ).
The start time and subject ID variables provide an opportunity to
represent time-dependent predictors that often appear in survival models. The
baseline strata variable, if present, divides all data into several strata
based on its categories, with separate baseline hazard values for each stratum.
The same regression coefficients β are used in all strata.
In the following examples variable childs is used as the end
time variable, variable life is the status variable with value 1
corresponding to the "event", happy and educ are a
factor and a covariate, respectively, and the model is using their main
effects and their interaction term. The first example does not have a
baseline strata variable, while the second one uses region for
that.
<!-- Cox regression example without a baseline strata variable: childs is the
     end time variable, life is the status variable (value 1 is the event),
     happy is a factor and educ is a covariate. -->
<GeneralRegressionModel modelType="CoxRegression"
                        modelName="CSCox"
                        functionName="regression"
                        endTimeVariable="childs"
                        statusVariable="life">
  <MiningSchema>
    <MiningField name="childs" usageType="active" missingValueTreatment="asIs"/>
    <MiningField name="happy" usageType="active" missingValueTreatment="asIs"/>
    <MiningField name="educ" usageType="active" missingValueTreatment="asIs"/>
    <MiningField name="life" usageType="target"/>
  </MiningSchema>
  <!-- Main effects of happy and educ plus their interaction; there is no
       intercept parameter in this list. -->
  <ParameterList>
    <Parameter name="P0000001" label="[happy=1]" referencePoint="0"/>
    <Parameter name="P0000002" label="[happy=2]" referencePoint="0"/>
    <Parameter name="P0000003" label="[happy=3]" referencePoint="0"/>
    <Parameter name="P0000004" label="educ" referencePoint="12.85536159601"/>
    <Parameter name="P0000005" label="[happy=1] * educ" referencePoint="0"/>
    <Parameter name="P0000006" label="[happy=2] * educ" referencePoint="0"/>
    <Parameter name="P0000007" label="[happy=3] * educ" referencePoint="0"/>
  </ParameterList>
  <FactorList>
    <Predictor name="happy"/>
  </FactorList>
  <CovariateList>
    <Predictor name="educ"/>
  </CovariateList>
  <!-- Interaction parameters P0000005 to P0000007 each have two cells: one
       factor category of happy and educ with exponent 1. -->
  <PPMatrix>
    <PPCell value="1" predictorName="happy" parameterName="P0000001"/>
    <PPCell value="2" predictorName="happy" parameterName="P0000002"/>
    <PPCell value="3" predictorName="happy" parameterName="P0000003"/>
    <PPCell value="1" predictorName="educ" parameterName="P0000004"/>
    <PPCell value="1" predictorName="happy" parameterName="P0000005"/>
    <PPCell value="1" predictorName="educ" parameterName="P0000005"/>
    <PPCell value="2" predictorName="happy" parameterName="P0000006"/>
    <PPCell value="1" predictorName="educ" parameterName="P0000006"/>
    <PPCell value="3" predictorName="happy" parameterName="P0000007"/>
    <PPCell value="1" predictorName="educ" parameterName="P0000007"/>
  </PPMatrix>
  <!-- Regression coefficients; P0000003 and P0000007 are listed with
       beta 0 and df 0. -->
  <ParamMatrix>
    <PCell parameterName="P0000001" beta="2.19176500383392" df="1"/>
    <PCell parameterName="P0000002" beta="0.839584538765938" df="1"/>
    <PCell parameterName="P0000003" beta="0" df="0"/>
    <PCell parameterName="P0000004" beta="0.207006511267958" df="1"/>
    <PCell parameterName="P0000005" beta="-0.124788379173099" df="1"/>
    <PCell parameterName="P0000006" beta="-0.0652692443310469" df="1"/>
    <PCell parameterName="P0000007" beta="0" df="0"/>
  </ParamMatrix>
  <!-- Status value(s) of life that count as the event. -->
  <EventValues>
    <Value value="1"/>
  </EventValues>
  <!-- Baseline cumulative hazard at each time point from 1 to 8. -->
  <BaseCumHazardTables maxTime="8">
    <BaselineCell time="1" cumHazard="0.0805149154781295"/>
    <BaselineCell time="2" cumHazard="0.208621561646413"/>
    <BaselineCell time="3" cumHazard="0.367889107749672"/>
    <BaselineCell time="4" cumHazard="0.610515527436034"/>
    <BaselineCell time="5" cumHazard="0.782436645962723"/>
    <BaselineCell time="6" cumHazard="0.898256334351415"/>
    <BaselineCell time="7" cumHazard="1.34645277785058"/>
    <BaselineCell time="8" cumHazard="1.92644296943848"/>
  </BaseCumHazardTables>
</GeneralRegressionModel>
<!-- Cox regression example with baseline strata: the end time is read from
     "childs", the status from "life", and a separate baseline cumulative
     hazard table is supplied for each value of "region". -->
<GeneralRegressionModel
    modelType="CoxRegression"
    modelName="CSCox"
    functionName="regression"
    endTimeVariable="childs"
    statusVariable="life"
    baselineStrataVariable="region">

  <!-- Input fields: "life" is the target, the remaining four are active. -->
  <MiningSchema>
    <MiningField name="childs" usageType="active" missingValueTreatment="asIs"/>
    <MiningField name="happy" usageType="active" missingValueTreatment="asIs"/>
    <MiningField name="educ" usageType="active" missingValueTreatment="asIs"/>
    <MiningField name="region" usageType="active"/>
    <MiningField name="life" usageType="target"/>
  </MiningSchema>

  <!-- Seven parameters: three levels of "happy", the "educ" term, and three
       "happy" by "educ" products. Only the "educ" parameter (P0000004) has a
       non-zero referencePoint. -->
  <ParameterList>
    <Parameter name="P0000001" label="[happy=1]" referencePoint="0"/>
    <Parameter name="P0000002" label="[happy=2]" referencePoint="0"/>
    <Parameter name="P0000003" label="[happy=3]" referencePoint="0"/>
    <Parameter name="P0000004" label="educ" referencePoint="12.85536159601"/>
    <Parameter name="P0000005" label="[happy=1] * educ" referencePoint="0"/>
    <Parameter name="P0000006" label="[happy=2] * educ" referencePoint="0"/>
    <Parameter name="P0000007" label="[happy=3] * educ" referencePoint="0"/>
  </ParameterList>

  <!-- "happy" is listed as a factor, "educ" as a covariate. -->
  <FactorList>
    <Predictor name="happy"/>
  </FactorList>
  <CovariateList>
    <Predictor name="educ"/>
  </CovariateList>

  <!-- Predictor-to-parameter mapping. P0000005 through P0000007 each have
       two cells, one for "happy" and one for "educ", matching their
       product labels in the ParameterList. -->
  <PPMatrix>
    <PPCell value="1" predictorName="happy" parameterName="P0000001"/>
    <PPCell value="2" predictorName="happy" parameterName="P0000002"/>
    <PPCell value="3" predictorName="happy" parameterName="P0000003"/>
    <PPCell value="1" predictorName="educ" parameterName="P0000004"/>

    <PPCell value="1" predictorName="happy" parameterName="P0000005"/>
    <PPCell value="1" predictorName="educ" parameterName="P0000005"/>

    <PPCell value="2" predictorName="happy" parameterName="P0000006"/>
    <PPCell value="1" predictorName="educ" parameterName="P0000006"/>

    <PPCell value="3" predictorName="happy" parameterName="P0000007"/>
    <PPCell value="1" predictorName="educ" parameterName="P0000007"/>
  </PPMatrix>

  <!-- Parameter estimates. P0000003 and P0000007 carry beta="0" and df="0". -->
  <ParamMatrix>
    <PCell parameterName="P0000001" beta="1.96429877799117" df="1"/>
    <PCell parameterName="P0000002" beta="0.487952271605177" df="1"/>
    <PCell parameterName="P0000003" beta="0" df="0"/>
    <PCell parameterName="P0000004" beta="0.186388616742954" df="1"/>
    <PCell parameterName="P0000005" beta="-0.0964727062694649" df="1"/>
    <PCell parameterName="P0000006" beta="-0.0257167272021955" df="1"/>
    <PCell parameterName="P0000007" beta="0" df="0"/>
  </ParamMatrix>

  <EventValues>
    <Value value="1"/>
  </EventValues>

  <!-- One baseline cumulative hazard table per stratum; maxTime is given on
       each BaselineStratum rather than on BaseCumHazardTables. -->
  <BaseCumHazardTables>
    <BaselineStratum value="1" label="[region=North East]" maxTime="7">
      <BaselineCell time="1" cumHazard="0.0480764996657994"/>
      <BaselineCell time="2" cumHazard="0.213530888447458"/>
      <BaselineCell time="3" cumHazard="0.347177590555568"/>
      <BaselineCell time="4" cumHazard="0.700088580976311"/>
      <BaselineCell time="5" cumHazard="0.756857216338272"/>
      <BaselineCell time="6" cumHazard="0.880125294006154"/>
      <BaselineCell time="7" cumHazard="1.79261158114014"/>
    </BaselineStratum>

    <!-- This stratum's last tabulated time (6) is below its maxTime (7). -->
    <BaselineStratum value="2" label="[region=South East]" maxTime="7">
      <BaselineCell time="1" cumHazard="0.104783416911293"/>
      <BaselineCell time="2" cumHazard="0.149899368179306"/>
      <BaselineCell time="3" cumHazard="0.344676164146026"/>
      <BaselineCell time="4" cumHazard="0.447807317242553"/>
      <BaselineCell time="5" cumHazard="0.602148704727296"/>
      <BaselineCell time="6" cumHazard="0.996057753780737"/>
    </BaselineStratum>

    <BaselineStratum value="3" label="[region=West]" maxTime="8">
      <BaselineCell time="1" cumHazard="0.0798136487904092"/>
      <BaselineCell time="2" cumHazard="0.148350388305914"/>
      <BaselineCell time="3" cumHazard="0.252784132000578"/>
      <BaselineCell time="4" cumHazard="0.366288821244008"/>
      <BaselineCell time="5" cumHazard="0.562653812085775"/>
      <BaselineCell time="6" cumHazard="0.61271473319101"/>
      <BaselineCell time="7" cumHazard="0.81698327174713"/>
      <BaselineCell time="8" cumHazard="1.28475458929774"/>
    </BaselineStratum>
  </BaseCumHazardTables>
</GeneralRegressionModel>
Scoring Algorithm
For this example, the scoring process follows steps similar to those of
the general linear example, but additional work is needed to compute
the survival and cumulative hazard values using the equations
presented above.
- Check if a baseline strata variable is present. If it is, get its
value from the case and check if there is a BaselineStratum
element for that value. If there is not, return missing values as the
result; otherwise get maxTime from that BaselineStratum's maxTime
attribute. In the absence of a strata variable, get maxTime from
the BaseCumHazardTables element.
- Get the value of the end time variable from the case. If it
is less than the minimum time in a BaselineCell in
the previously chosen BaselineStratum or
BaseCumHazardTables, then predicted survival is 1 and
cumulative hazard is 0. If the time value is greater than
maxTime, return missing value. Otherwise find the
BaselineCell that has the largest time attribute
value that is not greater than the time from the case. Extract baseline
cumulative hazard value H0(t) from its attribute
cumHazard.
- Compute the inner product r = <x,β> as described above
for regression models.
- Compute the inner product of the reference point vector
x0 (its values are located in Parameter
elements) and the parameter estimates β: s =
<x0,β>
- Finally, compute the cumulative hazard and survival:
H( t | x ) = H0( t ) exp( r - s ),
S( t | x ) = exp( -H( t | x ) ).
Note that a Cox Regression model can be valid even when there are no
parameters at all. In that case r=0 and s=0, so the cumulative hazard
is the same as the baseline cumulative hazard, and survival is still
computed by the formula presented above.
|