postprocess.c

The postprocess.c program builds upon basic.c and shows the workflow needed to support post-processing in Eduction. The program:

  1. creates and configures an eduction engine.
  2. creates post-processing tasks.
  3. creates an eduction session to process the input.
  4. adds input text to the session.
  5. performs the extraction.
  6. collects the matches from the extraction.
  7. runs post-processing tasks on the matches.
  8. prints the results.
  9. cleans up the post-processing tasks.
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <malloc.h>
#include <edk.h>
#include <string.h>

#ifdef _WIN32
#define stat _stat
#define off_t _off_t
#endif // _WIN32
#define BUFLEN 5120

// Helper function
// Prints the SDK version banner followed by the command-line usage summary.
// Version details are obtained from EdkGetVersion(); the change-set suffix is
// appended to the version only when vChangeSet is non-zero.
void displayusageinfo(void) {
    EDK_VERSION_INFO versionInfo;
    EdkGetVersion(&versionInfo);
    if (versionInfo.vChangeSet)
        printf("INFO: Eduction SDK Sample for SDK version v%s.%i\n", versionInfo.versionString, versionInfo.vChangeSet);
    else
        printf("INFO: Eduction SDK Sample for SDK version v%s\n", versionInfo.versionString);
    printf("INFO: SDK Built: %s\n", versionInfo.buildTime);
    printf("INFO: Copyright %s\n", versionInfo.copyright);
    // The placeholder names on the usage line match the parameter list below.
    printf("INFO: Usage: sample1 <grammarpath> <entities> <documentpath> <licensepath>\n");
    printf("INFO: Parameters:\n");
    printf("INFO:     <grammarpath>   Path to the grammar file that defines matchable\n");
    printf("INFO:                     entities. The grammar file can be in uncompiled (XML)\n");
    printf("INFO:                     or compiled (ECR) format.\n");
    printf("INFO:     <entities>      Comma delimited list of entities in the grammar file\n");
    printf("INFO:                     to be used for matching.\n");
    printf("INFO:     <documentpath>  Path to the document to be parsed.\n");
    printf("INFO:     <licensepath>   Path to the license file to be used.\n");
}

// Helper function
// Reports whether a path can be stat()'ed.
// Returns 1 when stat() succeeds for szFileName, and 0 when it fails or when
// szFileName is NULL.
int fileExists(const char * const szFileName) {
    struct stat info;

    if (szFileName == NULL) {
        return 0;
    }
    return (stat(szFileName, &info) == 0) ? 1 : 0;
}

// Helper function
// Validates the command line.
// Expects exactly four user arguments (argc == 5). Echoes them, then verifies
// that the grammar, document and license paths (argv[1], argv[3], argv[4])
// exist, in that order. Returns 1 when everything checks out, 0 otherwise;
// the usage text is printed only when the argument count is wrong.
int checkargs(const int argc, char **argv) {
    // argv indices that must name existing files, paired with the message
    // printed when the corresponding file is missing.
    static const int pathArgs[] = { 1, 3, 4 };
    static const char * const missingMsgs[] = {
        "FAIL: Grammar path does not exist.\n",
        "FAIL: Document path does not exist.\n",
        "FAIL: License path does not exist.\n"
    };
    size_t i;

    if (argc != 5) {
        printf("FAIL: Program requires four arguments.\n");
        displayusageinfo();
        return 0;
    }

    printf("INFO: Grammar Path:  %s\n", argv[1]);
    printf("INFO: Entities:      %s\n", argv[2]);
    printf("INFO: Document Path: %s\n", argv[3]);
    printf("INFO: License Path:  %s\n", argv[4]);

    for (i = 0; i < sizeof(pathArgs) / sizeof(pathArgs[0]); ++i) {
        if (!fileExists(argv[pathArgs[i]])) {
            printf("%s", missingMsgs[i]);
            return 0;
        }
    }
    return 1;
}

// Helper function
// Reads an entire file into a newly allocated, NUL-terminated buffer.
//
// fn: path of the file to read.
//
// Returns the buffer on success; the caller owns it and must free() it.
// Returns NULL, after printing a "FAIL:" message, when the file cannot be
// stat()'ed, is empty, cannot be opened, cannot be read completely, or when
// the buffer cannot be allocated.
char *readFile(const char * const fn) {
    struct stat fnInfo;
    size_t len;
    FILE *f;
    char *buf;
    size_t itemsRead;

    if (stat(fn, &fnInfo)) {
        printf("FAIL: Unable to get file size for \"%s\".\n", fn);
        return NULL;
    }
    // Reject empty files; also guards the size_t conversion below against a
    // non-positive st_size.
    if (fnInfo.st_size <= 0) {
        printf("FAIL: Zero byte file size for \"%s\".\n", fn);
        return NULL;
    }
    len = (size_t)fnInfo.st_size;

    f = fopen(fn, "rb");
    if (!f) {
        printf("FAIL: Unable to open file \"%s\".\n", fn);
        return NULL;
    }

    // One extra byte for the terminating NUL.
    buf = malloc(len + 1);
    if (!buf) {
        fclose(f);
        printf("FAIL: Unable to allocate memory to read \"%s\".\n", fn);
        return NULL;
    }

    itemsRead = fread(buf, 1, len, f);
    fclose(f);
    if (itemsRead < len) {
        free(buf);
        printf("FAIL: Unable to read \"%s\".\n", fn);
        return NULL;
    }
    buf[len] = '\0';
    return buf;
}

// Helper function
// Opens a file and reads its first block.
//
// fn:        path of the file to open.
// f:         receives the open stream on success; set to NULL when the
//            initial read fails after the file was opened.
// buf:       destination buffer for the first block.
// bufLen:    capacity of buf in bytes.
// fileSize:  receives the file size reported by stat().
// bytesRead: receives the number of bytes placed in buf (success only).
//
// Reads min(bufLen, file size) bytes. Returns 1 on success, or 0 after
// printing a "FAIL:" message when the file cannot be stat()'ed, is empty,
// cannot be opened, or yields fewer bytes than requested.
int readFirst(const char * const fn, FILE **f, char * const buf, const size_t bufLen, off_t * const fileSize, size_t * const bytesRead) {
    struct stat info;
    size_t wanted;
    size_t got;

    if (stat(fn, &info) != 0) {
        printf("FAIL: Unable to get file size for \"%s\".\n", fn);
        return 0;
    }

    *fileSize = info.st_size;
    if (*fileSize == 0) {
        printf("FAIL: Zero byte file size for \"%s\".\n", fn);
        return 0;
    }

    *f = fopen(fn, "rb");
    if (*f == NULL) {
        printf("FAIL: Unable to open file \"%s\".\n", fn);
        return 0;
    }

    // Never ask for more than the buffer holds or the file contains.
    if ((off_t)bufLen < *fileSize) {
        wanted = bufLen;
    } else {
        wanted = (size_t)*fileSize;
    }

    got = fread(buf, 1, wanted, *f);
    if (got >= wanted) {
        *bytesRead = got;
        return 1;
    }

    // Short read: release the stream so the caller has nothing to close.
    fclose(*f);
    *f = NULL;
    printf("FAIL: Unable to read \"%s\".\n", fn);
    return 0;
}

// Helper function
// Reads the next block from an already-open stream.
//
// fn:             file name, used only in the failure message.
// f:              stream to read from.
// buf:            destination buffer.
// bufLen:         capacity of buf in bytes.
// bytesRemaining: number of bytes still expected from the stream.
// bytesRead:      receives the number of bytes placed in buf (success only).
//
// Reads min(bufLen, bytesRemaining) bytes. When that amount is zero, stores 0
// in *bytesRead and returns 1 without touching the stream. Returns 0 after
// printing a "FAIL:" message when fewer bytes than requested are available.
int readNext(const char * const fn, FILE *f, char * const buf, const size_t bufLen, const size_t bytesRemaining, size_t * const bytesRead) {
    size_t wanted = bytesRemaining;
    size_t got;

    if (bufLen < bytesRemaining) {
        wanted = bufLen;
    }

    if (wanted == 0) {
        *bytesRead = 0;
        return 1;
    }

    got = fread(buf, 1, wanted, f);
    if (got < wanted) {
        printf("FAIL: Unable to continue reading \"%s\".\n", fn);
        return 0;
    }

    *bytesRead = got;
    return 1;
}

// Main function
// Program entry point: runs the eduction + post-processing sample end to end.
//
// Expected arguments (validated by checkargs):
//   argv[1] grammar path, argv[2] entity, argv[3] document path,
//   argv[4] license path.
//
// Steps: create and license the engine, load the grammar and target entity,
// build a post-processing task / task collection / post-processor, create a
// session, feed the document to the session in BUFLEN-sized blocks, collect
// the matches, run the post-processor over them and print the results.
//
// Returns 0 on success and -1 on any failure. Every resource acquired after
// the engine was created is released through the single cleanup path at the
// end of the function, so failure paths do not leak handles.
int main(int argc, char ** argv)
{
    int exitCode = -1;
    int32_t nErrCode;
    EdkEngineHandle pEngine;
    char *license;
    const char* szErrorMsg;
    EdkSessionHandle pSession;
    int sessionCreated = 0;     // pSession is only valid once this is set
    size_t bytesRead, bytesRemaining;
    FILE *f = NULL;
    char buf[BUFLEN], componentText[128];
    off_t fileSize;
    const char *szEntityName, *szEntityText, *szOrigText;
    size_t textSize, textLength, origSize, origLength, origOffset, offsetLength;
    double score;
    size_t nComponents, nComponent;
    EdkPostProcessTaskHandle pTask = NULL;
    EdkPostProcessTasksCollectionHandle pTaskSet = NULL;
    EdkPostProcessorHandle pProcessor = NULL;
    EdkMatchesCollectionHandle pMatchSet = NULL;
    EdkMatchHandle pMatch = NULL;
    size_t nMatches = 0;
    size_t nIndex = 0;

    printf("INFO: Program loaded.\n");

    if (!checkargs(argc, argv))
        return -1;
    printf("INFO: Parameters valid.\n");

    if (( nErrCode = EdkEngineCreate( &pEngine )) != EdkSuccess) {
        printf("Unable to create the EDK Engine. Error code: %d\n" , nErrCode);
        return -1;
    }
    printf("INFO: Engine created.\n");

    // From here on the engine exists, so every failure goes through cleanup.
    license = readFile(argv[4]);
    if (!license)
        goto cleanup;
    printf("INFO: License read.\n");

    // Set the license to the eduction engine
    if ((EdkSetLicenseKey( pEngine, license) != EdkSuccess)) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
        free(license);
        goto cleanup;
    }
    free(license);
    printf("INFO: License validated.\n");

    // Configure the eduction engine
    // Settings include:
    //  EnableComponents
    //  EnableUniqueMatches
    //  MaxMatchLength
    //  MaxMatchesPerDoc
    //  MatchWholeWord
    //  TokenWithPunctuation
    //  AllowOverlaps
    //  AllowMultipleResults
    //  MatchCases
    //  Locale
    EdkSetEnableComponents(pEngine, true);

    // Load resource file
    // Call this function repeatedly until all required resource files are loaded
    if ((EdkLoadResourceFile(pEngine, argv[1]) != EdkSuccess)) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
        goto cleanup;
    }
    printf("INFO: Resource file loaded.\n");

    // Add an entity to match against
    // Call this function repeatedly to add all desired entities
    // The entities to be added must be defined in the resource files added above
    if ((EdkAddTargetEntity(pEngine, argv[2]) != EdkSuccess)) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n", szErrorMsg, nErrCode);
        goto cleanup;
    }
    printf("INFO: Grammar(s) initialized.\n");

    /* Set up post processing stuff */
    if ((nErrCode = EdkPostProcessorTaskCreate("test_task", "scripts/turing.lua", argv[2], false, &pTask)) != EdkSuccess)
    {
        printf("Unable to create post processing task. Error code: %d\n" , nErrCode);
        pTask = NULL;       // creation failed: nothing to destroy in cleanup
        goto cleanup;
    }

    printf("INFO: Post process task test_task, with script scripts/turing.lua, created.\n");

    if ((nErrCode = EdkPostProcessorTasksCollectionCreate(&pTaskSet)) != EdkSuccess)
    {
        printf("Unable to create post processing task collection. Error code: %d\n" , nErrCode);
        pTaskSet = NULL;    // creation failed: nothing to destroy in cleanup
        goto cleanup;
    }

    if ((nErrCode = EdkPostProcessorTasksCollectionAddTask(pTaskSet, pTask)) != EdkSuccess)
    {
        printf("Unable to add post processing task to collection. Error code: %d\n" , nErrCode);
        goto cleanup;
    }

    printf("INFO: Post process task list created.\n");

    if ((nErrCode = EdkPostProcessorCreate(pTaskSet, &pProcessor)) != EdkSuccess)
    {
        printf("Unable to create post processor. Error code: %d\n" , nErrCode);
        pProcessor = NULL;  // creation failed: nothing to destroy in cleanup
        goto cleanup;
    }

    printf("INFO: Post processor object initialized.\n");

    // Create an eduction session associated with this eduction engine
    // Multiple sessions can be created for concurrent processing in
    // multithreaded applications
    if (EdkSessionCreate(pEngine, &pSession) != EdkSuccess) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
        goto cleanup;
    }
    sessionCreated = 1;
    printf("INFO: Session created.\n");

    if (!readFirst(argv[3], &f, buf, BUFLEN, &fileSize, &bytesRead)) {
        // readFirst only leaves a stream open on success.
        f = NULL;
        goto cleanup;
    }
    bytesRemaining = (size_t)fileSize - bytesRead;
    printf("INFO: Data file opened and %d byte block reads initiated.\n", BUFLEN);

    // Add input data
    // EdkAddInputText is called repeatedly for as many times as needed until all the input has been exhausted
    // The input data must be UTF-8 encoded.
    // Note: An alternative method of adding input data is to create a data input stream
    while (bytesRead)
    {
        printf("INFO: Adding data block to engine.\n");
        // The last argument flags the final block (nothing left to read).
        if ((EdkAddInputText( pSession, buf, bytesRead, bytesRemaining ? false : true)) != EdkSuccess)
        {
            nErrCode = EdkGetLastSessionError(pSession, &szErrorMsg);
            printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
            goto cleanup;
        }
        printf("INFO: Data block added.\n");

        if(!readNext(argv[3], f, buf, BUFLEN, bytesRemaining, &bytesRead))
            goto cleanup;
        bytesRemaining -= bytesRead;
    }

    fclose(f);
    f = NULL;

    /* Get all matches found from the input and process them */
    if ((nErrCode = EdkFillMatches(pSession, &pMatchSet)) != EdkSuccess)
    {
        printf("Unable to retrieve matches from current session. Error code: %d\n" , nErrCode);
        pMatchSet = NULL;   // retrieval failed: nothing to destroy in cleanup
        goto cleanup;
    }

    printf("INFO: Running post-processor on match set.\n");
    EdkPostProcessorRun(pProcessor, pMatchSet);
    EdkGetNumMatches(pMatchSet, &nMatches);
    printf("INFO: Post processing complete.\n");
    printf("%s", nMatches == 1 ? "PASS: " : "FAIL: ");
    printf("Got expected number of matches.\n");

    for (nIndex = 0; nIndex < nMatches; nIndex++)
    {
        EdkRetrieveMatch(pMatchSet, nIndex, &pMatch);
        /* print out match info using match info accessors */
        EdkMatchGetEntityName(pMatch, &szEntityName);
        EdkMatchGetMatchedTextOffset(pMatch, &origOffset);
        EdkMatchGetMatchedTextOffsetLength(pMatch, &offsetLength);
        EdkMatchGetScore(pMatch, &score);
        EdkMatchGetNormalizedTextSize(pMatch, &textSize);
        EdkMatchGetNormalizedTextLength(pMatch, &textLength);
        EdkMatchGetMatchedTextSize(pMatch, &origSize);
        EdkMatchGetMatchedTextLength(pMatch, &origLength);
        EdkMatchGetMatchedText(pMatch, &szOrigText);
        EdkMatchGetNormalizedText(pMatch, &szEntityText);
        // size_t values are printed with %zu.
        printf("INFO:   EntityName=\"%s\" Offset=\"%zu\" OffsetLength=\"%zu\"\n", szEntityName, origOffset, offsetLength);
        printf("INFO:   Score=\"%04.2f\" NormalizedTextSize=\"%zu\" NormalizedTextLength=\"%zu\"\n", score, textSize, textLength);
        printf("INFO:   OriginalTextSize=\"%zu\" OriginalTextLength=\"%zu\"\n", origSize, origLength);
        printf("INFO:   Original Text=\"%s\"\n", szOrigText);
        printf("INFO:   Normalized Text=\"%s\"\n", szEntityText);
        EdkMatchGetComponentCount(pMatch, &nComponents);
        for (nComponent = 0; nComponent < nComponents; ++nComponent)
        {
            EdkMatchComponentHandle pComponent = NULL;
            size_t copyLen;
            EdkMatchGetComponentHandle(pMatch, nComponent, &pComponent);

            EdkMatchComponentGetName(pComponent, &szEntityName);
            EdkMatchComponentGetMatchedTextOffset(pComponent, &origOffset);
            EdkMatchComponentGetMatchedTextLength(pComponent, &offsetLength);
            EdkMatchComponentGetSize(pComponent, &origSize);
            EdkMatchComponentGetLength(pMatch, pComponent, &origLength);
            // Clamp the copy so a long component cannot overrun componentText.
            // NOTE(review): the component offset is applied to the normalized
            // text, as in the original sample - confirm against the SDK docs.
            copyLen = origSize < sizeof(componentText) - 1 ? origSize : sizeof(componentText) - 1;
            strncpy(componentText, szEntityText + origOffset, copyLen);
            componentText[copyLen] = '\0';
            printf( "INFO:     Component Name=\"%s\" Text=\"%s\"\n", szEntityName, componentText);
            printf( "INFO:     Offset=\"%zu\" OffsetLength=\"%zu\" TextSize=\"%zu\" TextLength=\"%zu\"\n", origOffset, offsetLength, origSize, origLength);
        }
        printf("INFO: Post processing complete.\n");
        printf("%s", score == 5.00 ? "PASS: " : "FAIL: ");
        printf("Got expected score for match.\n");
        printf("%s", !strcmp("Alan Turing", szOrigText) ? "PASS: " : "FAIL: ");
        printf("Got expected text for match.\n");
    }

    exitCode = 0;

cleanup:
    // Close the input file if a failure left it open.
    if (f)
        fclose(f);

    // Destroy the post-processing things, plus the match set
    if (pTask)
        EdkPostProcessorTaskDestroy(pTask);
    if (pTaskSet)
        EdkPostProcessorTasksCollectionDestroy(pTaskSet);
    if (pProcessor)
        EdkPostProcessorDestroy(pProcessor);
    if (pMatchSet)
        EdkDestroyMatches(pMatchSet);

    // Destroy the session handle and release the resource
    if (sessionCreated)
        EdkSessionDestroy(pSession);

    // Ensure that all session handles have been destroyed before calling this
    EdkEngineDestroy(pEngine);

    if (exitCode == 0)
        printf("PASS: Program completed without an error.\n");
    return exitCode;
}

_HP_HTML5_bannerTitle.htm