From d2de251c4d018c7d517d399d7d5e0e20d853972f Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Tue, 29 Oct 2024 16:00:18 -0400 Subject: [PATCH 1/2] Pleroma.Upload.Filter.Dedupe: sharding directory structure Dedupe now uses a three-level sharding directory structure to improve performance when many files are uploaded and stored on a filesystem instead of an object store. (note: Minio still affected as it still uses a traditional filesystem) This does not help if you already have hundreds of thousands of files uploaded. The media URLs are permanently part of the activity so the files cannot be relocated. A motivated user could write a tool to move the files and perhaps write an Nginx or equivalent redirect to make the files still accessible, but that is beyond the scope of this change. --- changelog.d/dedupe-sharding.change | 1 + lib/pleroma/upload/filter/dedupe.ex | 10 +++++++++- test/pleroma/object_test.exs | 8 ++++---- test/pleroma/upload/filter/dedupe_test.exs | 4 +++- test/pleroma/upload_test.exs | 6 ++++-- 5 files changed, 21 insertions(+), 8 deletions(-) create mode 100644 changelog.d/dedupe-sharding.change diff --git a/changelog.d/dedupe-sharding.change b/changelog.d/dedupe-sharding.change new file mode 100644 index 000000000..2e140d8a2 --- /dev/null +++ b/changelog.d/dedupe-sharding.change @@ -0,0 +1 @@ +Dedupe upload filter now uses a three-level sharding directory structure diff --git a/lib/pleroma/upload/filter/dedupe.ex b/lib/pleroma/upload/filter/dedupe.ex index ef793d390..7b278d299 100644 --- a/lib/pleroma/upload/filter/dedupe.ex +++ b/lib/pleroma/upload/filter/dedupe.ex @@ -17,8 +17,16 @@ defmodule Pleroma.Upload.Filter.Dedupe do |> Base.encode16(case: :lower) filename = shasum <> "." <> extension - {:ok, :filtered, %Upload{upload | id: shasum, path: filename}} + + {:ok, :filtered, %Upload{upload | id: shasum, path: shard_path(filename)}} end def filter(_), do: {:ok, :noop} + + @spec shard_path(String.t()) :: String.t() + def shard_path( + <> = filename + ) do + Path.join([a, b, c, filename]) + end end diff --git a/test/pleroma/object_test.exs b/test/pleroma/object_test.exs index b3c528e32..ed5c2b6c8 100644 --- a/test/pleroma/object_test.exs +++ b/test/pleroma/object_test.exs @@ -174,8 +174,9 @@ defmodule Pleroma.ObjectTest do filename = Path.basename(href) - assert {:ok, files} = File.ls(uploads_dir) - assert filename in files + expected_path = Path.join([uploads_dir, Pleroma.Upload.Filter.Dedupe.shard_path(filename)]) + + assert File.exists?(expected_path) Object.delete(note) @@ -183,8 +184,7 @@ defmodule Pleroma.ObjectTest do assert Object.get_by_id(note.id).data["deleted"] assert Object.get_by_id(attachment.id) == nil - assert {:ok, files} = File.ls(uploads_dir) - refute filename in files + refute File.exists?(expected_path) end test "with objects that have legacy data.url attribute" do diff --git a/test/pleroma/upload/filter/dedupe_test.exs b/test/pleroma/upload/filter/dedupe_test.exs index 29c181509..cd5ce121b 100644 --- a/test/pleroma/upload/filter/dedupe_test.exs +++ b/test/pleroma/upload/filter/dedupe_test.exs @@ -23,10 +23,12 @@ defmodule Pleroma.Upload.Filter.DedupeTest do tempfile: Path.absname("test/fixtures/image_tmp.jpg") } + expected_path = Dedupe.shard_path(@shasum <> ".jpg") + assert { :ok, :filtered, - %Pleroma.Upload{id: @shasum, path: @shasum <> ".jpg"} + %Pleroma.Upload{id: @shasum, path: ^expected_path} } = Dedupe.filter(upload) end end diff --git a/test/pleroma/upload_test.exs b/test/pleroma/upload_test.exs index facb634c3..5fd62fa43 100644 --- a/test/pleroma/upload_test.exs +++ b/test/pleroma/upload_test.exs @@ -149,6 +149,9 @@ defmodule Pleroma.UploadTest do test "copies the file to the configured folder with deduping" do File.cp!("test/fixtures/image.jpg", "test/fixtures/image_tmp.jpg") + expected_filename = "e30397b58d226d6583ab5b8b3c5defb0c682bda5c31ef07a9f57c1c4986e3781.jpg" + + expected_path = Pleroma.Upload.Filter.Dedupe.shard_path(expected_filename) file = %Plug.Upload{ content_type: "image/jpeg", @@ -159,8 +162,7 @@ defmodule Pleroma.UploadTest do {:ok, data} = Upload.store(file, filters: [Pleroma.Upload.Filter.Dedupe]) assert List.first(data["url"])["href"] == - Pleroma.Upload.base_url() <> - "e30397b58d226d6583ab5b8b3c5defb0c682bda5c31ef07a9f57c1c4986e3781.jpg" + Path.join([Pleroma.Upload.base_url(), expected_path]) end test "copies the file to the configured folder without deduping" do From ebea518c8c9dea17ff18c8fa8192a7957adcfadb Mon Sep 17 00:00:00 2001 From: Lain Soykaf Date: Tue, 12 Nov 2024 12:43:16 +0400 Subject: [PATCH 2/2] B DedupeTest: Add explicit test for the sharding structure --- test/pleroma/upload/filter/dedupe_test.exs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/pleroma/upload/filter/dedupe_test.exs b/test/pleroma/upload/filter/dedupe_test.exs index cd5ce121b..4dc28b998 100644 --- a/test/pleroma/upload/filter/dedupe_test.exs +++ b/test/pleroma/upload/filter/dedupe_test.exs @@ -10,6 +10,10 @@ defmodule Pleroma.Upload.Filter.DedupeTest do @shasum "e30397b58d226d6583ab5b8b3c5defb0c682bda5c31ef07a9f57c1c4986e3781" + test "generates a shard path for a shasum" do + assert "e3/03/97/" <> _path = Dedupe.shard_path(@shasum) + end + test "adds shasum" do File.cp!( "test/fixtures/image.jpg",