cjknormalization.c

The cjknormalization.c program builds upon basic.c and shows how you can perform normalization of Chinese, Japanese, and Korean input text before you process it in an eduction session.

#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <malloc.h>
#include <edk.h>
#include <string.h>

#ifdef _WIN32
#define stat _stat
#define off_t _off_t
#endif // _WIN32
#define BUFLEN 5120

// Helper function
// Prints the SDK version banner (as reported by EdkGetVersion) followed by
// the command-line usage summary for this sample. The change-set suffix is
// appended to the version only when vChangeSet is non-zero.
void displayusageinfo() {
    // Fixed usage text, emitted verbatim after the version details.
    static const char * const usageLines[] = {
        "INFO: Usage: cjknormalization.exe <grammarpath> <documentpath> <licensepath>\n",
        "INFO: Parameters:\n",
        "INFO:     <grammarpath>   Path to the grammar file to be used.\n",
        "INFO:     <documentpath>  Path to the document to be parsed.\n",
        "INFO:     <licensepath>   Path to the license file to be used.\n"
    };
    EDK_VERSION_INFO ver;
    size_t i;

    EdkGetVersion(&ver);

    if (!ver.vChangeSet) {
        printf("INFO: Eduction SDK Sample for SDK version v%s\n", ver.versionString);
    } else {
        printf("INFO: Eduction SDK Sample for SDK version v%s.%i\n", ver.versionString, ver.vChangeSet);
    }
    printf("INFO: SDK Built: %s\n", ver.buildTime);
    printf("INFO: Copyright %s\n", ver.copyright);

    for (i = 0; i < sizeof usageLines / sizeof usageLines[0]; i++) {
        fputs(usageLines[i], stdout);
    }
}

// Helper function
// Reports whether stat() succeeds on the given path.
// Returns 1 when stat() succeeds, 0 when it fails or when szFileName is NULL.
int fileExists(const char * const szFileName) {
    struct stat info;

    if (szFileName == NULL) {
        return 0;
    }
    return (stat(szFileName, &info) == 0) ? 1 : 0;
}

// Helper function
// Validates the command line: exactly three user-supplied arguments
// (grammar path, document path, license path) must be present and each must
// name an existing file system entry.
// Prints the received paths and a FAIL message for the first problem found.
// Returns 1 when all checks pass, 0 otherwise.
int checkargs(const int argc, char **argv) {
    // argc includes the program name, so three user arguments give argc == 4.
    if (argc != 4) {
        printf("FAIL: Program requires three arguments.\n");
        displayusageinfo();
        return 0;
    }
    printf("INFO: Grammar Path:  %s\n", argv[1]);
    printf("INFO: Document Path: %s\n", argv[2]);
    printf("INFO: License Path:  %s\n", argv[3]);
    if (!fileExists(argv[1])) {
        printf("FAIL: Grammar path does not exist.\n");
        return 0;
    }
    if (!fileExists(argv[2])) {
        printf("FAIL: Document path does not exist.\n");
        return 0;
    }
    if (!fileExists(argv[3])) {
        printf("FAIL: License path does not exist.\n");
        return 0;
    }
    return 1;
}

// Helper function
// Reads an entire file into a newly allocated, NUL-terminated buffer.
//
// fn: path of the file to read.
//
// Returns a malloc()'d buffer holding the file's bytes followed by '\0'; the
// caller owns it and must free() it. Returns NULL, after printing a FAIL
// message, when the file cannot be stat()'d, has a size of zero, cannot be
// opened, the buffer cannot be allocated, or fewer bytes than the reported
// size could be read.
char *readFile(const char * const fn) {
    struct stat fnInfo;
    off_t len;
    FILE *f;
    char *buf;
    size_t itemsRead;

    if (stat(fn, &fnInfo)) {
        printf("FAIL: Unable to get file size for \"%s\".\n", fn);
        return NULL;
    }
    len = fnInfo.st_size;
    if (!len) {
        printf("FAIL: Zero byte file size for \"%s\".\n", fn);
        return NULL;
    }
    f = fopen(fn, "rb");
    if (!f) {
        printf("FAIL: Unable to open file \"%s\".\n", fn);
        return NULL;
    }
    // One extra byte for the terminating NUL.
    buf = (char*)malloc((size_t)len + 1);
    if (!buf) {
        // Do not touch the buffer on allocation failure, and do not leak the stream.
        fclose(f);
        printf("FAIL: Unable to allocate memory to read \"%s\".\n", fn);
        return NULL;
    }
    itemsRead = fread(buf, 1, (size_t)len, f);
    fclose(f);
    if (itemsRead < (size_t)len) {
        free(buf);
        printf("FAIL: Unable to read \"%s\".\n", fn);
        return NULL;
    }
    buf[len] = '\0';
    return buf;
}

// Main function
int main(int argc, char ** argv)
{
    int32_t nErrCode;
    EdkEngineHandle pEngine;
    char *license;
    const char* szErrorMsg;
    EdkSessionHandle pSession;
    char* buf = NULL;
    off_t fileSize;
    const char *szEntityName, *szEntityText, *szOrigText;
    size_t textSize, textLength, origSize, origLength, origOffset, offsetLength;
    double score;
    const char* szCJKNormalizedText = NULL;
    const char* szCJKNormalizationOptions = "HWNum,HWAlpha";

    printf("INFO: Program loaded.\n");

    if (!checkargs(argc, argv))
        return -1;
    printf("INFO: Parameters valid.\n");

    if (( nErrCode = EdkEngineCreate( &pEngine )) != EdkSuccess) {
        printf("Unable to create the EDK Engine. Error code: %d\n" , nErrCode);
        return -1;
    }
     printf("INFO: Engine created.\n");

    license = readFile(argv[3]);
    if (!license) {
        EdkEngineDestroy( pEngine );
        return -1;
    }
    printf("INFO: License read.\n");

    // Set the license to the eduction engine
    if ((EdkSetLicenseKey( pEngine, license) != EdkSuccess)) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
        free(license);
        EdkEngineDestroy(pEngine);
          return -1;
    }
    free(license);
    printf("INFO: License validated.\n");

    // Load resource file
    // Call this function repeatedly until all required resource files are loaded
    if ((EdkLoadResourceFile(pEngine, argv[1]) != EdkSuccess)) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
        EdkEngineDestroy(pEngine);
        return -1;
    }
    printf("INFO: Resource file loaded.\n");

    // Add an entity to match to against
    // Call this function repeatedly to add all desired entities
    // The entities to be added must be defined in the resource files added above
    if ((EdkAddTargetEntity(pEngine, "e/e") != EdkSuccess)) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n", szErrorMsg, nErrCode);
        EdkEngineDestroy(pEngine);
        return -1;
    }
    printf("INFO: Grammar(s) initialized.\n");

    // Create an eduction session assosicated with this eduction engine
    // Multiple sessions can be created and concurrent processing in multithreaded applications
    if (EdkSessionCreate(pEngine, &pSession) != EdkSuccess) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
        EdkEngineDestroy(pEngine);
        return -1;
    }
    printf("INFO: Session created.\n");

    buf = readFile(argv[2]);
    if (!buf) {
        EdkSessionDestroy(pSession);
        EdkEngineDestroy(pEngine);
        return -1;
    }
    printf("INFO: Data file opened and %d byte block reads initiated.\n", BUFLEN);

    if (EdkCJKNormalizeText(pEngine, buf, &szCJKNormalizedText, szCJKNormalizationOptions) != EdkSuccess) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
        EdkSessionDestroy(pSession);
        EdkEngineDestroy(pEngine);
        return -1;
    }
    printf("INFO: Input buffer normalized.\n");

    // Add input data
    // EdkAddInputText is called repeatedly for as many times as needed until all the input has been exhausted
    // The input data must be UTF-8 encoded.
    // Note: An alternative method of adding input data is to create a data input stream
    printf("INFO: Adding data block to engine.\n");
    if ((EdkAddInputText( pSession, szCJKNormalizedText, strlen(szCJKNormalizedText), true)) != EdkSuccess) {
        nErrCode = EdkGetLastSessionError(pSession, &szErrorMsg);
        printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
        EdkSessionDestroy(pSession);
        EdkEngineDestroy(pEngine);
        return -1;
    }
    printf("INFO: Data block added.\n");

    // Get a match
    // This is called repeatedly to get all matches
    while (EdkGetNextMatch(pSession) == EdkSuccess) {
        // While we have a match, obtain all required information about the match
        EdkGetMatchEntityName(pSession, &szEntityName);
        EdkGetMatchOrigOffset(pSession, &origOffset);
        EdkGetMatchOrigOffsetLength(pSession, &offsetLength);
        EdkGetMatchScore(pSession, &score);
        EdkGetMatchTextSize(pSession, &textSize);
        EdkGetMatchTextLength(pSession, &textLength);
        EdkGetMatchOrigSize(pSession, &origSize);
        EdkGetMatchOrigLength(pSession, &origLength);
        EdkGetMatchOrigText(pSession, &szOrigText);
        EdkGetMatchText(pSession, &szEntityText);
        printf("INFO: EntityName=\"%s\" Offset=\"%u\" OffsetLength=\"%u\"\n", szEntityName, origOffset, offsetLength);
        printf("INFO:   Score=\"%04.2f\" NormalizedTextSize=\"%u\" NormalizedTextLength=\"%u\"\n", score, textSize, textLength);
        printf("INFO:   OriginalTextSize=\"%u\" OriginalTextLength=\"%u\"\n", origSize, origLength);
        printf("INFO:   Original Text=\"%s\"\n", szOrigText);
        printf("INFO:   Normalized Text=\"%s\"\n", szEntityText);
    }
    printf("INFO: Matching on block complete.\n");

    nErrCode = EdkGetLastSessionError(pSession, &szErrorMsg);
    if ((nErrCode != EdkNoMatch)) {
        printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
        EdkSessionDestroy(pSession);
        EdkEngineDestroy(pEngine);
        return -1;
    }

    // Destroy the session handle and release the resource
    EdkSessionDestroy(pSession);

    // Ensure that all session handles have been destroyed before calling this
    EdkEngineDestroy(pEngine);

    printf("PASS: Program completed without an error.\n");
    return 0;
}

_HP_HTML5_bannerTitle.htm