The postprocess.c program builds upon basic.c and shows the workflow needed to support post-processing in Eduction. The program:
#include <stdlib.h> #include <stdio.h> #include <sys/types.h> #include <sys/stat.h> #include <malloc.h> #include <edk.h> #include <string.h> #ifdef _WIN32 #define stat _stat #define off_t _off_t #endif // _WIN32 #define BUFLEN 5120 // Helper function void displayusageinfo() { EDK_VERSION_INFO versionInfo; EdkGetVersion(&versionInfo); if (versionInfo.vChangeSet) printf("INFO: Eduction SDK Sample for SDK version v%s.%i\n", versionInfo.versionString, versionInfo.vChangeSet); else printf("INFO: Eduction SDK Sample for SDK version v%s\n", versionInfo.versionString); printf("INFO: SDK Built: %s\n", versionInfo.buildTime); printf("INFO: Copyright %s\n", versionInfo.copyright); printf("INFO: Usage: sample1 <grammarpath> <entity> <documentpath> <licensepath>\n"); printf("INFO: Parameters:\n"); printf("INFO: <grammarpath> Path to the grammar file that defines matchable\n"); printf("INFO: entities. The grammar file can be in uncompiled (XML)\n"); printf("INFO: or compiled (ECR) format.\n"); printf("INFO: <entities> Comma deliminted list of entities in the grammar file\n"); printf("INFO to be used for matching.\n"); printf("INFO: <documentpath> Path to the document to be parsed.\n"); printf("INFO: <licensepath> Path to the license file to be used.\n"); } // Helper function int fileExists(const char * const szFileName) { struct stat buf; int exists; if (!szFileName) return 0; exists = stat(szFileName, &buf) == 0; return exists; } // Helper function int checkargs(const int argc, char **argv) { if (argc != 5) { printf("FAIL: Program requires four arguments.\n"); displayusageinfo(); return 0; } printf("INFO: Grammar Path: %s\n", argv[1]); printf("INFO: Entities: %s\n", argv[2]); printf("INFO: Document Path: %s\n", argv[3]); printf("INFO: License Path: %s\n", argv[4]); if (!fileExists(argv[1])) { printf("FAIL: Grammar path does not exist.\n"); return 0; } if (!fileExists(argv[3])) { printf("FAIL: Document path does not exist.\n"); return 0; } if (!fileExists(argv[4])) { 
printf("FAIL: License path does not exist.\n"); return 0; } return 1; } // Helper function char *readFile(const char * const fn) { struct stat fnInfo; off_t len; FILE *f; char *buf; size_t itemsRead; if (stat(fn, &fnInfo)) { printf("FAIL: Unable to get file size for \"%s\".\n", fn); return NULL; } len = fnInfo.st_size; if (!len) { printf("FAIL: Zero byte file size for \"%s\".\n", fn); return NULL; } f = fopen(fn, "rb"); if (!f) { printf("FAIL: Unable to open file \"%s\".\n", fn); return NULL; } buf = (char*)malloc(len+1); itemsRead = fread(buf, 1, len, f); *(buf+len) = '\0'; fclose(f); if (itemsRead < (size_t)len) { free(buf); printf("FAIL: Unable to read \"%s\".\n", fn); return NULL; } return buf; } // Helper function int readFirst(const char * const fn, FILE **f, char * const buf, const size_t bufLen, off_t * const fileSize, size_t * const bytesRead) { struct stat fnInfo; size_t itemsToRead, itemsRead; if (stat(fn, &fnInfo)) { printf("FAIL: Unable to get file size for \"%s\".\n", fn); return 0; } *fileSize = fnInfo.st_size; if (!*fileSize) { printf("FAIL: Zero byte file size for \"%s\".\n", fn); return 0; } *f = fopen(fn, "rb"); if (!*f) { printf("FAIL: Unable to open file \"%s\".\n", fn); return 0; } itemsToRead = (off_t)bufLen < *fileSize ? bufLen : (size_t)*fileSize; itemsRead = fread(buf, 1, itemsToRead, *f); if (itemsRead < itemsToRead) { fclose(*f); *f = NULL; printf("FAIL: Unable to read \"%s\".\n", fn); return 0; } *bytesRead = itemsRead; return 1; } // Helper function int readNext(const char * const fn, FILE *f, char * const buf, const size_t bufLen, const size_t bytesRemaining, size_t * const bytesRead) { size_t itemsToRead = bufLen < bytesRemaining ? 
bufLen : bytesRemaining; size_t itemsRead; if (!itemsToRead) { *bytesRead = 0; return 1; } itemsRead = fread(buf, 1, itemsToRead, f); if (itemsRead < itemsToRead) { printf("FAIL: Unable to continue reading \"%s\".\n", fn); return 0; } *bytesRead = itemsRead; return 1; } // Main function int main(int argc, char ** argv) { int32_t nErrCode; EdkEngineHandle pEngine; char *license; const char* szErrorMsg; EdkSessionHandle pSession; size_t bytesRead, bytesRemaining; FILE *f; char buf[BUFLEN], componentText[128]; off_t fileSize; const char *szEntityName, *szEntityText, *szOrigText; size_t textSize, textLength, origSize, origLength, origOffset, offsetLength; double score; size_t nComponents, nComponent; EdkPostProcessTaskHandle pTask = NULL; EdkPostProcessTasksCollectionHandle pTaskSet = NULL; EdkPostProcessorHandle pProcessor = NULL; EdkMatchesCollectionHandle pMatchSet = NULL; EdkMatchHandle pMatch = NULL; size_t nMatches = 0; size_t nIndex = 0; printf("INFO: Program loaded.\n"); if (!checkargs(argc, argv)) return -1; printf("INFO: Parameters valid.\n"); if (( nErrCode = EdkEngineCreate( &pEngine )) != EdkSuccess) { printf("Unable to create the EDK Engine. 
Error code: %d\n" , nErrCode); return -1; } printf("INFO: Engine created.\n"); license = readFile(argv[4]); if (!license) { EdkEngineDestroy( pEngine ); return -1; } printf("INFO: License read.\n"); // Set the license to the eduction engine if ((EdkSetLicenseKey( pEngine, license) != EdkSuccess)) { nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg); printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode); free(license); EdkEngineDestroy(pEngine); return -1; } free(license); printf("INFO: License validated.\n"); // Configure the eduction engine // Settings include: // EnableComponents // EnableUniqueMatches // MaxMatchLength // MaxMatchesPerDoc // MatchWholeWord // TokenWithPunctuation // AllowOverlaps // AllowMultipleResults // MatchCases // Locale EdkSetEnableComponents(pEngine, true); // Load resource file // Call this function repeatedly until all required resource files are loaded if ((EdkLoadResourceFile(pEngine, argv[1]) != EdkSuccess)) { nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg); printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode); EdkEngineDestroy(pEngine); return -1; } printf("INFO: Resource file loaded.\n"); // Add an entity to match to against // Call this function repeatedly to add all desired entities // The entities to be added must be defined in the resource files added above if ((EdkAddTargetEntity(pEngine, argv[2]) != EdkSuccess)) { nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg); printf("FAIL: %s (%d)\n", szErrorMsg, nErrCode); EdkEngineDestroy(pEngine); return -1; } printf("INFO: Grammar(s) initialized.\n"); /* Set up post processing stuff */ if ((nErrCode = EdkPostProcessorTaskCreate("test_task", "scripts/turing.lua", argv[2], false, &pTask)) != EdkSuccess) { printf("Unable to create post processing task. 
Error code: %d\n" , nErrCode); EdkEngineDestroy(pEngine); return -1; } printf("INFO: Post process task test_task, with script scripts/turing.lua, created.\n"); if ((nErrCode = EdkPostProcessorTasksCollectionCreate(&pTaskSet)) != EdkSuccess) { printf("Unable to create post processing task collection. Error code: %d\ n" , nErrCode); EdkPostProcessorTaskDestroy(pTask); EdkEngineDestroy(pEngine); return -1; } if ((nErrCode = EdkPostProcessorTasksCollectionAddTask(pTaskSet, pTask)) != EdkSuccess) { printf("Unable to add post processing task to collection. Error code: %d\ n" , nErrCode); EdkPostProcessorTaskDestroy(pTask); EdkPostProcessorTasksCollectionDestroy(pTaskSet); EdkEngineDestroy(pEngine); return -1; } printf("INFO: Post process task list created.\n"); if ((nErrCode = EdkPostProcessorCreate(pTaskSet, &pProcessor)) != EdkSuccess) { printf("Unable to add post processing task to collection. Error code: %d\ n" , nErrCode); EdkPostProcessorTaskDestroy(pTask); EdkPostProcessorTasksCollectionDestroy(pTaskSet); EdkEngineDestroy(pEngine); return -1; } printf("INFO: Post processor object initialized.\n"); // Create an eduction session associated with this eduction engine // Multiple sessions can be created and concurrent processing in multithreaded applications if (EdkSessionCreate(pEngine, &pSession) != EdkSuccess) { nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg); printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode); EdkEngineDestroy(pEngine); return -1; } printf("INFO: Session created.\n"); if (!readFirst(argv[3], &f, buf, BUFLEN, &fileSize, &bytesRead)) { EdkSessionDestroy(pSession); EdkEngineDestroy(pEngine); return -1; } bytesRemaining = (size_t)fileSize - bytesRead; printf("INFO: Data file opened and %d byte block reads initiated.\n", BUFLEN); // Add input data // EdkAddInputText is called repeatedly for as many times as needed until all the input has been exhausted // The input data must be UTF-8 encoded. 
// Note: An alternative method of adding input data is to create a data input stream while (bytesRead) { printf("INFO: Adding data block to engine.\n"); if ((EdkAddInputText( pSession, buf, bytesRead, bytesRemaining ? false : true)) != EdkSuccess) { fclose(f); nErrCode = EdkGetLastSessionError(pSession, &szErrorMsg); printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode); EdkSessionDestroy(pSession); EdkEngineDestroy(pEngine); return -1; } printf("INFO: Data block added.\n"); if(!readNext(argv[3], f, buf, BUFLEN, bytesRemaining, &bytesRead)) { fclose(f); EdkSessionDestroy(pSession); EdkEngineDestroy(pEngine); return -1; } bytesRemaining -= bytesRead; } fclose(f); /* Get all matches found from the input and process them */ if ((nErrCode = EdkFillMatches(pSession, &pMatchSet)) != EdkSuccess) { printf("Unable to retrieve matches from current session. Error code: %d\ n" , nErrCode); EdkPostProcessorTaskDestroy(pTask); EdkPostProcessorTasksCollectionDestroy(pTaskSet); EdkEngineDestroy(pEngine); return -1; } printf("INFO: Running post-processor on match set.\n"); EdkPostProcessorRun(pProcessor, pMatchSet); EdkGetNumMatches(pMatchSet, &nMatches); printf("INFO: Post processing complete.\n"); printf(nMatches == 1 ? 
"PASS: " : "FAIL: "); printf("Got expected number of matches.\n"); for (nIndex = 0; nIndex < nMatches; nIndex++) { EdkRetrieveMatch(pMatchSet, nIndex, &pMatch); /* print out match info using match info accessors */ EdkMatchGetEntityName(pMatch, &szEntityName); EdkMatchGetMatchedTextOffset(pMatch, &origOffset); EdkMatchGetMatchedTextOffsetLength(pMatch, &offsetLength); EdkMatchGetScore(pMatch, &score); EdkMatchGetNormalizedTextSize(pMatch, &textSize); EdkMatchGetNormalizedTextLength(pMatch, &textLength); EdkMatchGetMatchedTextSize(pMatch, &origSize); EdkMatchGetMatchedTextLength(pMatch, &origLength); EdkMatchGetMatchedText(pMatch, &szOrigText); EdkMatchGetNormalizedText(pMatch, &szEntityText); printf("INFO: EntityName=\"%s\" Offset=\"%u\" OffsetLength=\"%u\"\n", szEntityName, origOffset, offsetLength); printf("INFO: Score=\"%04.2f\" NormalizedTextSize=\"%u\" NormalizedTextLength=\"%u\"\n", score, textSize, textLength); printf("INFO: OriginalTextSize=\"%u\" OriginalTextLength=\"%u\"\n", origSize, origLength); printf("INFO: Original Text=\"%s\"\n", szOrigText); printf("INFO: Normalized Text=\"%s\"\n", szEntityText); EdkMatchGetComponentCount(pMatch, &nComponents); for (nComponent = 0; nComponent < nComponents; ++nComponent) { EdkMatchComponentHandle pComponent = NULL; EdkMatchGetComponentHandle(pMatch, nComponent, &pComponent); EdkMatchComponentGetName(pComponent, &szEntityName); EdkMatchComponentGetMatchedTextOffset(pComponent, &origOffset); EdkMatchComponentGetMatchedTextLength(pComponent, &offsetLength); EdkMatchComponentGetSize(pComponent, &origSize); EdkMatchComponentGetLength(pMatch, pComponent, &origLength); strncpy(componentText, szEntityText + origOffset, origSize); *(componentText + origSize) = '\0'; printf( "INFO: Component Name=\"%s\" Text=\"%s\"\n", szEntityName, componentText); printf( "INFO: Offset=\"%u\" OffsetLength=\"%u\" TextSize=\"%u\" TextLength=\"%u\"\n", origOffset, offsetLength, origSize, origLength); } printf("INFO: Post processing 
complete.\n"); printf(score == 5.00 ? "PASS: " : "FAIL: "); printf("Got expected score for match.\n"); printf(!strcmp("Alan Turing", szOrigText) ? "PASS: " : "FAIL: "); printf("Got expected text for match.\n"); } // Destroy the post-processing things, plus the match set EdkPostProcessorTaskDestroy(pTask); EdkPostProcessorTasksCollectionDestroy(pTaskSet); EdkPostProcessorDestroy(pProcessor); EdkDestroyMatches(pMatchSet); // Destroy the session handle and release the resource EdkSessionDestroy(pSession); // Ensure that all session handles have been destroyed before calling this EdkEngineDestroy(pEngine); printf("PASS: Program completed without an error.\n"); return 0; }
|