Skip to content

Commit

Permalink
filesize is not provided by curl (#1871)
Browse files Browse the repository at this point in the history
Fix the documentation: `filesize` is not provided by curl

See discussion at: curl/curl#13527

Calling curl with a file does not provide the `size` field for the file:

```sh
curl --trace-ascii debug.txt -F "file=@test.txt" "http://127.0.0.1:8080/fscrawler/_document?simulate=true"
```

Gives:

```txt
== Info:   Trying 127.0.0.1:8080...
== Info: Connected to 127.0.0.1 (127.0.0.1) port 8080
=> Send header, 224 bytes (0xe0)
0000: POST /fscrawler/_document?simulate=true HTTP/1.1
0032: Host: 127.0.0.1:8080
0048: User-Agent: curl/8.4.0
0060: Accept: */*
006d: Content-Length: 214
0082: Content-Type: multipart/form-data; boundary=--------------------
00c2: ----VzJBwyDNXJA2IVvgyzIvvA
00de:
=> Send data, 214 bytes (0xd6)
0000: --------------------------VzJBwyDNXJA2IVvgyzIvvA
0032: Content-Disposition: form-data; name="file"; filename="test.txt"
0074: Content-Type: text/plain
008e:
0090: This is my text.
00a2: --------------------------VzJBwyDNXJA2IVvgyzIvvA--
== Info: We are completely uploaded and fine
<= Recv header, 17 bytes (0x11)
0000: HTTP/1.1 200 OK
<= Recv header, 32 bytes (0x20)
0000: Content-Type: application/json
<= Recv header, 21 bytes (0x15)
0000: Content-Length: 489
<= Recv header, 2 bytes (0x2)
0000:
<= Recv data, 489 bytes (0x1e9)
0000: {.  "ok" : true,.  "filename" : "test.txt",.  "url" : "https://1
0040: 27.0.0.1:9200/rest/_doc/dd18bf3a8ea2a3e53e2661c7fb53534",.  "doc
0080: " : {.    "content" : "This is my text\n\n",.    "meta" : { },.
00c0:    "file" : {.      "extension" : "txt",.      "content_type" :
0100: "text/plain; charset=ISO-8859-1",.      "indexing_date" : "2024-
0140: 05-03T10:39:47.685+00:00",.      "filesize" : -1,.      "filenam
0180: e" : "test.txt".    },.    "path" : {.      "virtual" : "test.tx
01c0: t",.      "real" : "test.txt".    }.  }.}
== Info: Connection #0 to host 127.0.0.1 left intact
```

The important part is:

```txt
0000: --------------------------VzJBwyDNXJA2IVvgyzIvvA
0032: Content-Disposition: form-data; name="file"; filename="test.txt"
0074: Content-Type: text/plain
008e:
0090: This is my text.
00a2: --------------------------VzJBwyDNXJA2IVvgyzIvvA--
== Info: We are completely uploaded and fine
```

We can see that the `size` parameter of the file is not provided in the `Content-Disposition` header.

But when calling the same endpoint using the Java `jakarta.ws.rs.client` client, the `size` is provided:

```
1 > PUT http://127.0.0.1:8080/fscrawler/_document/1234
1 > Accept: multipart/form-data,application/json
1 > Content-Type: multipart/form-data
--Boundary_1_46114008_1714750065797
Content-Type: application/octet-stream
Content-Disposition: form-data; filename="test.txt"; modification-date="Fri, 03 May 2024 15:27:44 GMT"; size=30; name="file"

This file contains some words.
--Boundary_1_46114008_1714750065797--
```

[RFC 2183](https://datatracker.ietf.org/doc/html/rfc2183#section-2.7) does not make this parameter mandatory.
So the workaround is to compute it from the CLI and send it as a tag:

```sh
echo "This is my text" > test.txt
curl -F "file=@test.txt" \
  -F "tags={\"file\":{\"filesize\":$(ls -l test.txt | awk '{print $5}')}}" \
  "http://127.0.0.1:8080/fscrawler/_document"
```

Related to #1868
  • Loading branch information
dadoonet committed May 13, 2024
1 parent 1b8b9ad commit 32113f9
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 6 deletions.
11 changes: 10 additions & 1 deletion docs/source/admin/fs/rest.rst
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ You will get back your document as it has been stored by elasticsearch:
}
}
If you started FSCrawler in debug mode with ``--debug`` or if you pass
If you started FSCrawler in debug mode or if you pass
``debug=true`` query parameter, then the response will be much more
complete:

Expand Down Expand Up @@ -279,6 +279,15 @@ The field ``external`` doesn't necessarily be a flat structure. This is a more a
}
}
You can use this technique to add, for example, the filesize of the file you are uploading:

.. code:: sh
echo "This is my text" > test.txt
curl -F "file=@test.txt" \
-F "tags={\"file\":{\"filesize\":$(ls -l test.txt | awk '{print $5}')}}" \
"http://127.0.0.1:8080/fscrawler/_document"
.. attention:: Only standard :ref:`FSCrawler fields <generated_fields>` can be set outside ``external`` field name.

Remove a document
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ public void testUploadDocumentWithIdUsingPut() throws Exception {
// We wait until we have our document
ESSearchResponse response = countTestHelper(new ESSearchRequest().withIndex(getCrawlerName()), 1L, null);
assertThat(response.getHits().get(0).getId(), is("1234"));
assertThat(JsonPath.read(response.getHits().get(0).getSource(), "$.file.filesize"), notNullValue());
assertThat(JsonPath.read(response.getHits().get(0).getSource(), "$.file.filesize"), greaterThan(0));
}

@Test
Expand Down Expand Up @@ -224,7 +224,14 @@ public void testAllDocumentsWithRestExternalIndex() throws Exception {
.timeValueMinutes(2));
for (ESSearchHit hit : response.getHits()) {
assertThat(JsonPath.read(hit.getSource(), "$.file.extension"), notNullValue());
assertThat(JsonPath.read(hit.getSource(), "$.file.filesize"), notNullValue());
int filesize = JsonPath.read(hit.getSource(), "$.file.filesize");
if (filesize <= 0) {
// On some machines (e.g. GitHub Actions), the size is not provided
logger.warn("File [{}] has a size of [{}]",
JsonPath.read(hit.getSource(), "$.file.filename"), filesize);
} else {
assertThat(JsonPath.read(hit.getSource(), "$.file.filesize"), greaterThan(0));
}
}
}

Expand All @@ -249,7 +256,7 @@ public void testDocumentWithExternalTags() throws Exception {
checkDocument("add_external.txt", hit -> {
assertThat(JsonPath.read(hit.getSource(), "$.content"), containsString("This file content will be extracted"));
assertThat(JsonPath.read(hit.getSource(), "$.file.extension"), notNullValue());
assertThat(JsonPath.read(hit.getSource(), "$.file.filesize"), notNullValue());
assertThat(JsonPath.read(hit.getSource(), "$.file.filesize"), greaterThan(0));
expectThrows(PathNotFoundException.class, () -> JsonPath.read(hit.getSource(), "$.meta"));
assertThat(JsonPath.read(hit.getSource(), "$.external.tenantId"), is(23));
assertThat(JsonPath.read(hit.getSource(), "$.external.company"), is("shoe company"));
Expand All @@ -265,7 +272,7 @@ public void testDocumentWithExternalTags() throws Exception {
checkDocument("replace_content_and_external.txt", hit -> {
assertThat(JsonPath.read(hit.getSource(), "$.content"), is("OVERWRITTEN CONTENT"));
assertThat(JsonPath.read(hit.getSource(), "$.file.extension"), notNullValue());
assertThat(JsonPath.read(hit.getSource(), "$.file.filesize"), notNullValue());
assertThat(JsonPath.read(hit.getSource(), "$.file.filesize"), greaterThan(0));
expectThrows(PathNotFoundException.class, () -> JsonPath.read(hit.getSource(), "$.meta"));
assertThat(JsonPath.read(hit.getSource(), "$.external.tenantId"), is(23));
assertThat(JsonPath.read(hit.getSource(), "$.external.company"), is("shoe company"));
Expand All @@ -281,7 +288,7 @@ public void testDocumentWithExternalTags() throws Exception {
checkDocument("replace_content_only.txt", hit -> {
assertThat(JsonPath.read(hit.getSource(), "$.content"), is("OVERWRITTEN CONTENT"));
assertThat(JsonPath.read(hit.getSource(), "$.file.extension"), notNullValue());
assertThat(JsonPath.read(hit.getSource(), "$.file.filesize"), notNullValue());
assertThat(JsonPath.read(hit.getSource(), "$.file.filesize"), greaterThan(0));
expectThrows(PathNotFoundException.class, () -> JsonPath.read(hit.getSource(), "$.meta"));
expectThrows(PathNotFoundException.class, () -> JsonPath.read(hit.getSource(), "$.external"));
});
Expand Down

0 comments on commit 32113f9

Please sign in to comment.