pleroma/lib/pleroma/html.ex

# Pleroma: A lightweight social networking server
# Copyright © 2017-2020 Pleroma Authors <https://pleroma.social/>
# SPDX-License-Identifier: AGPL-3.0-only

defmodule Pleroma.HTML do
  # Scrubbers are compiled on boot so they can be configured in OTP releases
  #  @on_load :compile_scrubbers

  def compile_scrubbers do
    dir = Path.join(:code.priv_dir(:pleroma), "scrubbers")

    dir
    |> Pleroma.Utils.compile_dir()
    |> case do
      {:error, _errors, _warnings} ->
        raise "Compiling scrubbers failed"

      {:ok, _modules, _warnings} ->
        :ok
    end
  end

  defp get_scrubbers(scrubber) when is_atom(scrubber), do: [scrubber]
  defp get_scrubbers(scrubbers) when is_list(scrubbers), do: scrubbers
  defp get_scrubbers(_), do: [Pleroma.HTML.Scrubber.Default]

  def get_scrubbers do
    Pleroma.Config.get([:markup, :scrub_policy])
    |> get_scrubbers
  end

  def filter_tags(html, nil) do
    filter_tags(html, get_scrubbers())
  end

  def filter_tags(html, scrubbers) when is_list(scrubbers) do
    Enum.reduce(scrubbers, html, fn scrubber, html ->
      filter_tags(html, scrubber)
    end)
  end

  def filter_tags(html, scrubber) do
    {:ok, content} = FastSanitize.Sanitizer.scrub(html, scrubber)
    content
  end

  def filter_tags(html), do: filter_tags(html, nil)
  def strip_tags(html), do: filter_tags(html, FastSanitize.Sanitizer.StripTags)

  def get_cached_scrubbed_html_for_activity(
        content,
        scrubbers,
        activity,
        key \\ "",
        callback \\ fn x -> x end
      ) do
    key = "#{key}#{generate_scrubber_signature(scrubbers)}|#{activity.id}"

    Cachex.fetch!(:scrubber_cache, key, fn _key ->
      object = Pleroma.Object.normalize(activity)
      ensure_scrubbed_html(content, scrubbers, object.data["fake"] || false, callback)
    end)
  end

  def get_cached_stripped_html_for_activity(content, activity, key) do
    get_cached_scrubbed_html_for_activity(
      content,
      FastSanitize.Sanitizer.StripTags,
      activity,
      key,
      &HtmlEntities.decode/1
    )
  end

  def ensure_scrubbed_html(
        content,
        scrubbers,
        fake,
        callback
      ) do
    content =
      content
      |> filter_tags(scrubbers)
      |> callback.()

    if fake do
      {:ignore, content}
    else
      {:commit, content}
    end
  end

  defp generate_scrubber_signature(scrubber) when is_atom(scrubber) do
    generate_scrubber_signature([scrubber])
  end

  defp generate_scrubber_signature(scrubbers) do
    Enum.reduce(scrubbers, "", fn scrubber, signature ->
      "#{signature}#{to_string(scrubber)}"
    end)
  end

  def extract_first_external_url(_, nil), do: {:error, "No content"}

  def extract_first_external_url(object, content) do
    key = "URL|#{object.id}"

    Cachex.fetch!(:scrubber_cache, key, fn _key ->
      result =
        content
        |> Floki.parse_fragment!()
        |> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"]")
        |> Floki.attribute("a", "href")
        |> Enum.at(0)

      {:commit, {:ok, result}}
    end)
  end
end
add license boilerplate to pleroma core 2018-12-23 12:04:54 -08:00			`# Pleroma: A lightweight social networking server`
Bump copyright years of files changed after 2020-01-07 Done via the following command: git diff fcd5dd259a1700a045be902b43391b0d1bd58a5b --stat --name-only \| xargs sed -i '/Pleroma Authors/c# Copyright © 2017-2020 Pleroma Authors <https:\/\/pleroma.social\/>' 2020-03-01 21:08:45 -08:00			`# Copyright © 2017-2020 Pleroma Authors <https://pleroma.social/>`
add license boilerplate to pleroma core 2018-12-23 12:04:54 -08:00			`# SPDX-License-Identifier: AGPL-3.0-only`

html: new module providing a configurable markup scrubbing policy 2018-09-09 16:29:00 -07:00			`defmodule Pleroma.HTML do`
HTML: Compile Scrubbers on boot This makes it possible to configure their behavior on OTP releases. 2019-12-08 08:42:40 -08:00			`# Scrubbers are compiled on boot so they can be configured in OTP releases`
			`# @on_load :compile_scrubbers`

			`def compile_scrubbers do`
			`dir = Path.join(:code.priv_dir(:pleroma), "scrubbers")`

			`dir`
Use Pleroma.Utils.compile_dir/1 in Pleroma.HTML.compile_scrubbers/0 2019-12-09 09:38:01 -08:00			`\|> Pleroma.Utils.compile_dir()`
HTML: Compile Scrubbers on boot This makes it possible to configure their behavior on OTP releases. 2019-12-08 08:42:40 -08:00			`\|> case do`
			`{:error, _errors, _warnings} ->`
			`raise "Compiling scrubbers failed"`

			`{:ok, _modules, _warnings} ->`
			`:ok`
			`end`
			`end`

html: allow scrubbing policies to be stackable 2018-09-15 19:07:01 -07:00			`defp get_scrubbers(scrubber) when is_atom(scrubber), do: [scrubber]`
			`defp get_scrubbers(scrubbers) when is_list(scrubbers), do: scrubbers`
			`defp get_scrubbers(_), do: [Pleroma.HTML.Scrubber.Default]`

[Credo] Remove parentesis on argument-less functions 2019-03-04 19:18:43 -08:00			`def get_scrubbers do`
Runtime configuration Related to #85 Everything should now be configured at runtime, with the exception of the `Pleroma.HTML` scrubbers (the scrubbers used can be changed at runtime, but their configuration is compile-time) because it's building a module with a macro. 2018-11-06 10:34:57 -08:00			`Pleroma.Config.get([:markup, :scrub_policy])`
html: allow scrubbing policies to be stackable 2018-09-15 19:07:01 -07:00			`\|> get_scrubbers`
			`end`

html: default to using normal scrub policy if provided scrub policy is nil 2018-09-21 18:10:53 -07:00			`def filter_tags(html, nil) do`
shame on me for not testing after revert 2018-12-30 11:44:17 -08:00			`filter_tags(html, get_scrubbers())`
			`end`

			`def filter_tags(html, scrubbers) when is_list(scrubbers) do`
			`Enum.reduce(scrubbers, html, fn scrubber, html ->`
html: allow scrubbing policies to be stackable 2018-09-15 19:07:01 -07:00			`filter_tags(html, scrubber)`
			`end)`
html: new module providing a configurable markup scrubbing policy 2018-09-09 16:29:00 -07:00			`end`

Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 15:18:08 -07:00			`def filter_tags(html, scrubber) do`
			`{:ok, content} = FastSanitize.Sanitizer.scrub(html, scrubber)`
			`content`
			`end`

html: default to using normal scrub policy if provided scrub policy is nil 2018-09-21 18:10:53 -07:00			`def filter_tags(html), do: filter_tags(html, nil)`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 15:18:08 -07:00			`def strip_tags(html), do: filter_tags(html, FastSanitize.Sanitizer.StripTags)`
Move scrubber cache-related functions to Pleroma.HTML 2018-12-30 23:19:48 -08:00
add scrubber for html special char 2019-04-30 12:52:17 -07:00			`def get_cached_scrubbed_html_for_activity(`
			`content,`
			`scrubbers,`
			`activity,`
			`key \\ "",`
			`callback \\ fn x -> x end`
			`) do`
Refactor html caching functions to have a key instead of a module, use more correct terminology and fix summaries in mastoapi 2019-04-05 05:19:44 -07:00			`key = "#{key}#{generate_scrubber_signature(scrubbers)}\|#{activity.id}"`
Remove commented-out code 2019-10-29 10:58:54 -07:00
Fix the issue with HTML scrubber 2019-04-01 01:55:59 -07:00			`Cachex.fetch!(:scrubber_cache, key, fn _key ->`
Merge branch 'develop' into feature/database-compaction 2019-04-17 02:22:32 -07:00			`object = Pleroma.Object.normalize(activity)`
add scrubber for html special char 2019-04-30 12:52:17 -07:00			`ensure_scrubbed_html(content, scrubbers, object.data["fake"] \|\| false, callback)`
Fix the issue with HTML scrubber 2019-04-01 01:55:59 -07:00			`end)`
Move scrubber cache-related functions to Pleroma.HTML 2018-12-30 23:19:48 -08:00			`end`

Refactor html caching functions to have a key instead of a module, use more correct terminology and fix summaries in mastoapi 2019-04-05 05:19:44 -07:00			`def get_cached_stripped_html_for_activity(content, activity, key) do`
			`get_cached_scrubbed_html_for_activity(`
oopsies 2019-01-04 15:25:31 -08:00			`content,`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 15:18:08 -07:00			`FastSanitize.Sanitizer.StripTags,`
Refactor html caching functions to have a key instead of a module, use more correct terminology and fix summaries in mastoapi 2019-04-05 05:19:44 -07:00			`activity,`
add scrubber for html special char 2019-04-30 12:52:17 -07:00			`key,`
			`&HtmlEntities.decode/1`
oopsies 2019-01-04 15:25:31 -08:00			`)`
html: default to using normal scrub policy if provided scrub policy is nil 2018-09-21 18:10:53 -07:00			`end`

Move scrubber cache-related functions to Pleroma.HTML 2018-12-30 23:19:48 -08:00			`def ensure_scrubbed_html(`
			`content,`
Fix the issue with HTML scrubber 2019-04-01 01:55:59 -07:00			`scrubbers,`
add scrubber for html special char 2019-04-30 12:52:17 -07:00			`fake,`
			`callback`
Move scrubber cache-related functions to Pleroma.HTML 2018-12-30 23:19:48 -08:00			`) do`
add scrubber for html special char 2019-04-30 12:52:17 -07:00			`content =`
			`content`
			`\|> filter_tags(scrubbers)`
			`\|> callback.()`

			`if fake do`
			`{:ignore, content}`
			`else`
			`{:commit, content}`
			`end`
Move scrubber cache-related functions to Pleroma.HTML 2018-12-30 23:19:48 -08:00			`end`

			`defp generate_scrubber_signature(scrubber) when is_atom(scrubber) do`
			`generate_scrubber_signature([scrubber])`
			`end`

			`defp generate_scrubber_signature(scrubbers) do`
			`Enum.reduce(scrubbers, "", fn scrubber, signature ->`
Different caches based on the module. Remove scrubber version since it is not relevant anymore 2019-01-04 15:19:46 -08:00			`"#{signature}#{to_string(scrubber)}"`
Move scrubber cache-related functions to Pleroma.HTML 2018-12-30 23:19:48 -08:00			`end)`
			`end`
html: default to using normal scrub policy if provided scrub policy is nil 2018-09-21 18:10:53 -07:00
html: don't attempt to parse nil content 2019-02-04 21:06:17 -08:00			`def extract_first_external_url(_, nil), do: {:error, "No content"}`

html: add utility function to extract first URL from an object and cache the result 2019-01-26 06:55:12 -08:00			`def extract_first_external_url(object, content) do`
			`key = "URL\|#{object.id}"`

			`Cachex.fetch!(:scrubber_cache, key, fn _key ->`
			`result =`
			`content`
Use floki's new APIs for parsing fragments 2020-02-15 14:55:26 -08:00			`\|> Floki.parse_fragment!()`
Rich Media: Skip Microformats hashtags When fixing this problem I incorrectly assumed a.hashtag is the proper way for detecting hashtags, but it is just something Pleroma and Mastodon add. Per microformats it should be detected by the presense of rel=tag. This MR adds a check for rel=tag, but I still left a.hashtag just in case 2019-06-18 14:31:30 -07:00			`\|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"]")`
html: add utility function to extract first URL from an object and cache the result 2019-01-26 06:55:12 -08:00			`\|> Floki.attribute("a", "href")`
			`\|> Enum.at(0)`

rich media: kill some testsuite noise 2019-01-28 12:55:33 -08:00			`{:commit, {:ok, result}}`
html: add utility function to extract first URL from an object and cache the result 2019-01-26 06:55:12 -08:00			`end)`
html: new module providing a configurable markup scrubbing policy 2018-09-09 16:29:00 -07:00			`end`
			`end`