diff --git a/catdoc.go b/catdoc.go
index 59b16c5..000cea3 100644
--- a/catdoc.go
+++ b/catdoc.go
@@ -102,6 +102,24 @@ func GetCommentsFromFile(file io.ReadSeeker) (string, error) {
 	return callWASMFuncWithFile("get_comments", file)
 }
 
+// GetAnnotationAuthorsFromFile returns the names of the annotation authors
+// reported by the "get_annotation_authors" WASM export for the given .doc
+// file, one slice element per author.
+//
+// The export prints one name per line, each terminated by "\n"; the final
+// terminator is removed before splitting so that no spurious empty element
+// is appended, and empty output yields a nil slice.
+func GetAnnotationAuthorsFromFile(file io.ReadSeeker) ([]string, error) {
+	r, err := callWASMFuncWithFile("get_annotation_authors", file)
+	if err != nil {
+		return nil, err
+	}
+	if r == "" {
+		return nil, nil
+	}
+	return strings.Split(strings.TrimSuffix(r, "\n"), "\n"), nil
+}
+
 func GetVersion() (string, error) {
 	return callWASMFunc("get_version", nil)
 }
diff --git a/catdoc.wasm b/catdoc.wasm
index 27f960d..b9faf81 100755
Binary files a/catdoc.wasm and b/catdoc.wasm differ
diff --git a/catdoc/src/Makefile b/catdoc/src/Makefile
index 52d6688..d2655f7 100644
--- a/catdoc/src/Makefile
+++ b/catdoc/src/Makefile
@@ -105,7 +105,7 @@ catdoc-wasm: emsdk.uptodate
 	-I. -O3 -DHAVE_CONFIG_H \
 	-DCATDOC_VERSION=\"0.95\" \
 	-DCHARSETPATH=\"charsets\" \
-	-sEXPORTED_FUNCTIONS=_get_text,_get_author,_get_last_author,_get_version,_get_title,_get_subject,_get_keywords,_get_comments\
+	-sEXPORTED_FUNCTIONS=_get_text,_get_author,_get_last_author,_get_version,_get_title,_get_subject,_get_keywords,_get_comments,_get_annotation_authors\
 	-sSTANDALONE_WASM -sWARN_ON_UNDEFINED_SYMBOLS=0 \
 	--no-entry -sFILESYSTEM=1 -sALLOW_MEMORY_GROWTH -sMAXIMUM_MEMORY=1GB
 
diff --git a/catdoc/src/analyze.c b/catdoc/src/analyze.c
index 1ecdd9d..fa25c92 100644
--- a/catdoc/src/analyze.c
+++ b/catdoc/src/analyze.c
@@ -18,6 +18,14 @@ extern char ole_sign[], zip_sign[]; /* from ole.c */
 char rtf_sign[] = "{\\rtf";
 char old_word_sign[] = {0xdb, 0xa5, 0};
 char write_sign[] = {0x31, 0xBE, 0};
+/*
+ * Signature used to locate the annotation author table: a zero 16-bit field,
+ * the 16-bit length 7 and the UTF-16LE text "Unknown", which is the first
+ * entry of that table.
+ */
+unsigned char sttbfRMark[] = {0x00, 0x00, 0x07, 0x00, 0x55, 0x00,
+                              0x6E, 0x00, 0x6B, 0x00, 0x6E, 0x00,
+                              0x6F, 0x00, 0x77, 0x00, 0x6E, 0x00};
 int verbose = 0;
 
 unsigned char *read_metadata(unsigned char *buffer, metadata metadata_type) {
@@ -38,6 +46,132 @@ unsigned char *read_metadata(unsigned char *buffer, metadata metadata_type) {
   return NULL;
 }
 
+/* Writes message and a newline to stderr, then exits with status 1. */
+static void annotation_authors_fail(const char *message) {
+  fprintf(stderr, "%s\n", message);
+  exit(1);
+}
+
+/*
+ * Searches stream f for the first occurrence of the mark_length bytes at
+ * mark and returns the offset of the match, counted from the position the
+ * stream had when the function was called.
+ *
+ * The stream is read block by block. The last mark_length - 1 bytes of each
+ * block are kept in front of the next one, so a match that straddles a block
+ * boundary is found and no index ever points before the buffer.
+ *
+ * Does not return on failure: when the pattern is empty, memory cannot be
+ * allocated or the pattern is absent, a message is written to stderr and the
+ * process exits with status 1.
+ */
+int find_offset(FILE *f, unsigned char *mark, int mark_length) {
+  const size_t block_size = 1024 * 25;
+  size_t mark_len, carried = 0, n;
+  long base = 0; /* offset of buf[0] relative to the starting position */
+  unsigned char *buf;
+
+  if (mark_length <= 0) {
+    annotation_authors_fail("sttbfRMark search pattern is empty");
+  }
+  mark_len = (size_t)mark_length;
+  buf = malloc(block_size + mark_len - 1);
+  if (buf == NULL) {
+    annotation_authors_fail("out of memory while searching for sttbfRMark");
+  }
+
+  /* fread() returning 0 ends the loop on end of file and on read errors. */
+  while ((n = fread(buf + carried, 1, block_size, f)) > 0) {
+    size_t avail = carried + n;
+    size_t keep, i, j;
+
+    for (i = 0; i + mark_len <= avail; i++) {
+      for (j = 0; j < mark_len && buf[i + j] == mark[j]; j++) {
+      }
+      if (j == mark_len) {
+        free(buf);
+        return (int)(base + (long)i);
+      }
+    }
+
+    /* Carry over the tail that may still be the beginning of a match. */
+    keep = avail < mark_len - 1 ? avail : mark_len - 1;
+    for (j = 0; j < keep; j++) {
+      buf[j] = buf[avail - keep + j];
+    }
+    base += (long)(avail - keep);
+    carried = keep;
+  }
+
+  free(buf);
+  annotation_authors_fail("sttbfRMark offset is not found");
+  return -1; /* not reached */
+}
+
+/*
+ * Prints the annotation authors of the Word document f to stdout, one name
+ * per line.
+ *
+ * The author table is located through the sttbfRMark signature. As read
+ * here it consists of the bytes 0xFF 0xFF, a 16-bit entry count and the
+ * signature itself, followed by the remaining entries: a 16-bit length and
+ * that many 16-bit little-endian characters each. The "Unknown" entry
+ * contained in the signature is not printed.
+ *
+ * Does not return on failure: when the table is missing, malformed or
+ * truncated, a message is written to stderr and the process exits with
+ * status 1.
+ */
+void read_annotation_authors(FILE *f) {
+  const int mark_length = (int)sizeof sttbfRMark;
+  unsigned char buf[2];
+  unsigned int count, i;
+  int offset;
+
+  /* The offsets used below are absolute, so search from the beginning. */
+  rewind(f);
+  /* The signature is preceded by the 0xFFFF marker and the entry count. */
+  offset = find_offset(f, sttbfRMark, mark_length) - 4;
+  if (offset < 0 || fseek(f, offset, SEEK_SET) != 0 ||
+      fread(buf, 1, 2, f) != 2 || buf[0] != 0xff || buf[1] != 0xff) {
+    annotation_authors_fail("sttbfRMark offset is invalid");
+  }
+  if (fread(buf, 1, 2, f) != 2) {
+    annotation_authors_fail("sttbfRMark table is truncated");
+  }
+  /* Mask to 16 bits in case getshort() sign-extends. */
+  count = (unsigned int)getshort(buf, 0) & 0xffffu;
+  /* Skip the signature; it holds the first entry, which count includes. */
+  if (count == 0 || fseek(f, mark_length, SEEK_CUR) != 0) {
+    annotation_authors_fail("sttbfRMark table is invalid");
+  }
+
+  for (i = 1; i < count; i++) {
+    unsigned int str_len, j;
+    unsigned char *raw;
+
+    if (fread(buf, 1, 2, f) != 2) {
+      annotation_authors_fail("sttbfRMark table is truncated");
+    }
+    str_len = (unsigned int)getshort(buf, 0) & 0xffffu;
+    if (str_len > 0) {
+      raw = malloc(2 * (size_t)str_len);
+      if (raw == NULL || fread(raw, 2, str_len, f) != str_len) {
+        free(raw);
+        annotation_authors_fail("sttbfRMark entry cannot be read");
+      }
+      for (j = 0; j < str_len; j++) {
+        /* Assemble the code unit byte-wise, independent of host order. */
+        unsigned short unit =
+            (unsigned short)(raw[2 * j] | (raw[2 * j + 1] << 8));
+        printf("%lc", unit);
+      }
+      free(raw);
+    }
+    printf("\n");
+  }
+}
+
 /*********************************************************************
  * Determines format of input file and calls parse_word_header or
  * process_file if
@@ -45,6 +179,11 @@ unsigned char *read_metadata(unsigned char *buffer, metadata metadata_type) {
  * return not 0 when error
  ********************************************************************/
 int analyze_format(FILE *f, metadata metadata_type) {
+  /* Annotation authors are read directly, without format detection. */
+  if (metadata_type == annotation_authors) {
+    read_annotation_authors(f);
+    return 0;
+  }
   unsigned char buffer[129];
   long offset = 0;
   FILE *new_file, *ole_file;
diff --git a/catdoc/src/catdoc.c b/catdoc/src/catdoc.c
index 3d83926..3709d44 100644
--- a/catdoc/src/catdoc.c
+++ b/catdoc/src/catdoc.c
@@ -56,7 +56,7 @@ int main(int argc, char **argv) {
   get_locale_charset();
 #endif
   metadata metadata_type = none;
-  while ((c = getopt(argc, argv, "Vls:d:f:taubxv8wALTSKCm:")) != -1) {
+  while ((c = getopt(argc, argv, "Vls:d:f:taubxv8wALTSKCUm:")) != -1) {
     switch (c) {
     case 's':
       check_charset(&source_csname, optarg);
@@ -104,6 +104,9 @@ int main(int argc, char **argv) {
     case 'C':
       metadata_type = comments;
       break;
+    case 'U':
+      metadata_type = annotation_authors;
+      break;
     case 'm': {
       char *endptr;
       wrap_margin = (int)strtol(optarg, &endptr, 0);
@@ -246,3 +249,12 @@ void get_comments() {
   char *args[] = {"", "-C", "/input_file/file.doc"};
   main(3, args);
 }
+
+/*
+ * WASM entry point: runs catdoc with -U on /input_file/file.doc so that the
+ * annotation authors of that document are printed to stdout.
+ */
+void get_annotation_authors() {
+  char *args[] = {"", "-U", "/input_file/file.doc"};
+  main(3, args);
+}
diff --git a/catdoc/src/catdoc.h b/catdoc/src/catdoc.h
index 0a8f932..826b82d 100644
--- a/catdoc/src/catdoc.h
+++ b/catdoc/src/catdoc.h
@@ -227,6 +227,7 @@ void get_title();
 void get_subject();
 void get_keywords();
 void get_comments();
+void get_annotation_authors();
 
 char *find_file(char *name, const char *path);
 char *stradd(const char *s1, const char *s2);
@@ -255,7 +256,8 @@ typedef enum {
   title,
   subject,
   keywords,
-  comments
+  comments,
+  annotation_authors
 } metadata;
 int analyze_format(FILE *f, metadata metadata_type);
 void list_charsets(void);
diff --git a/catdoc_test.go b/catdoc_test.go
index 5c8ff69..7e3e14c 100644
--- a/catdoc_test.go
+++ b/catdoc_test.go
@@ -31,6 +31,30 @@ func testFileFunc(title, expected string, fun func(io.ReadSeeker) (string, error
 	}
 }
 
+// testFileFuncArr opens test.doc, passes it to fun and fails the test unless
+// the returned slice equals expected element for element; title names the
+// value in the failure message.
+func testFileFuncArr(title string, expected []string, fun func(io.ReadSeeker) ([]string, error), t *testing.T) {
+	t.Helper()
+	f, err := os.Open("test.doc")
+	if err != nil {
+		t.Fatalf("could not open test document, %v", err)
+	}
+	defer f.Close()
+	arr, err := fun(f)
+	if err != nil {
+		t.Fatalf("expected error to be nil, got %v", err)
+	}
+	if len(arr) != len(expected) {
+		t.Fatalf("expected %s to be \"%v\", got %v", title, expected, arr)
+	}
+	for i := range expected {
+		if arr[i] != expected[i] {
+			t.Fatalf("expected %s to be \"%v\", got %v", title, expected, arr)
+		}
+	}
+}
+
 func TestGetTextFromFile(t *testing.T) {
 	testFileFunc("text", "text-inside-doc", gocatdoc.GetTextFromFile, t)
 }
@@ -58,3 +82,7 @@ func TestGetKeywordsFromFile(t *testing.T) {
 func TestGetCommentsFromFile(t *testing.T) {
 	testFileFunc("comments", "Comments", gocatdoc.GetCommentsFromFile, t)
 }
+
+func TestGetAnnotationAuthorsFromFile(t *testing.T) {
+	testFileFuncArr("annotation authors", []string{"H. Potter"}, gocatdoc.GetAnnotationAuthorsFromFile, t)
+}
diff --git a/test.doc b/test.doc
index 918bea8..16c96d7 100644
Binary files a/test.doc and b/test.doc differ