opentag.com
\\ Technologies :: Formats :: SRX

The Segmentation Rules eXchange (SRX) format is an XML format for describing the rules used to segment text. The rules are based on regular expressions.

Example of an SRX file:

<?xml version="1.0"?>
<!--
  Sample SRX 1.0 document. It declares two named sets of segmentation rules
  ("Default" and "Japanese") and one map that picks a set by language code.
  Each rule pairs a regular expression for the text before a candidate break
  position (beforebreak) with one for the text after it (afterbreak), and
  states through the break attribute whether that position is a break.
  NOTE(review): rule order is presumably significant (earlier rules taking
  precedence), which is why the break="no" exceptions are listed ahead of the
  break="yes" rules; confirm against the SRX specification before reordering.
-->
<srx version="1.0">
 <!-- NOTE(review): the header presumably sets processing options: whether
      sub-flows are segmented, and whether each kind of inline format handle
      (start, end, isolated) is included in the segment; confirm the exact
      semantics against the SRX specification. -->
 <header segmentsubflows="yes">
  <formathandle type="start" include="no"/>
  <formathandle type="end" include="yes"/>
  <formathandle type="isolated" include="yes"/>
 </header>
 <body>
  <languagerules>
   <!-- Rule set referenced by the catch-all ".*" entry of the map below. -->
   <languagerule languagerulename="Default">
    <!-- Exception: digits followed by a period at the start of the text
         (optionally after leading whitespace), then whitespace. This looks
         like a numbered list marker such as "1. ". -->
    <rule break="no">
     <beforebreak>^\s*[0-9]+\.</beforebreak>
     <afterbreak>\s</afterbreak>
    </rule>
    <!-- Exception: "etc." in any letter case, followed by whitespace and a
         lowercase ASCII letter (the sentence apparently continues). -->
    <rule break="no">
     <beforebreak>[Ee][Tt][Cc]\.</beforebreak>
     <afterbreak>\s[a-z]</afterbreak>
    </rule>
    <!-- Exception: the abbreviation "Mr." preceded and followed by
         whitespace. -->
    <rule break="no">
     <beforebreak>\sMr\.</beforebreak>
     <afterbreak>\s</afterbreak>
    </rule>
    <!-- Break: one or more of the characters . ? ! followed by whitespace. -->
    <rule break="yes">
     <beforebreak>[\.\?!]+</beforebreak>
     <afterbreak>\s</afterbreak>
    </rule>
    <!-- Break: position followed by a newline. The empty beforebreak
         presumably places no condition on the preceding text. -->
    <rule break="yes">
     <beforebreak></beforebreak>
     <afterbreak>\n</afterbreak>
    </rule>
   </languagerule>
   <!-- Rule set referenced by the "JA.*" entry of the map below. -->
   <languagerule languagerulename="Japanese">
    <!-- Exception: same numbered list marker pattern as in "Default". -->
    <rule break="no">
     <beforebreak>^\s*[0-9]+\.</beforebreak>
     <afterbreak>\s</afterbreak>
    </rule>
    <!-- Exception: "etc." in any letter case. Unlike "Default", the
         afterbreak is empty, so presumably any following text qualifies. -->
    <rule break="no">
     <beforebreak>[Ee][Tt][Cc]\.</beforebreak>
     <afterbreak></afterbreak>
    </rule>
    <!-- Break: one or more of the characters . ? ! followed by whitespace. -->
    <rule break="yes">
     <beforebreak>[\.\?!]+</beforebreak>
     <afterbreak>\s</afterbreak>
    </rule>
    <!-- Break: one or more Japanese sentence terminators, with no condition
         on what follows. Assuming \xhhhh denotes the code point U+hhhh, the
         set is: U+FF61 halfwidth ideographic full stop, U+3002 ideographic
         full stop, U+FF0E fullwidth full stop, U+FF1F fullwidth question
         mark, U+FF01 fullwidth exclamation mark. -->
    <rule break="yes">
     <beforebreak>[\xff61\x3002\xff0e\xff1f\xff01]+</beforebreak>
     <afterbreak></afterbreak>
    </rule>
    <!-- Break: position followed by a newline, as in "Default". -->
    <rule break="yes">
     <beforebreak></beforebreak>
     <afterbreak>\n</afterbreak>
    </rule>
   </languagerule>
  </languagerules>
  <maprules>
   <!-- Associates language-code patterns with the rule sets defined above:
        codes matching "JA.*" use "Japanese", and ".*" sends everything else
        to "Default".
        NOTE(review): this presumably relies on the first matching
        languagemap winning, so the catch-all must stay last; confirm. Also
        verify whether matching is case-sensitive (a code such as "ja-JP"
        would not match "JA.*" if it is). -->
   <maprule maprulename="Default">
    <languagemap languagepattern="JA.*" languagerulename="Japanese"/>
    <languagemap languagepattern=".*" languagerulename="Default"/>
   </maprule>
  </maprules>
 </body>
</srx>

For more information on SRX see the LISA web pages on this topic.