<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<!-- Meta tags for social media banners; these should be filled in appropriately as they are your "business card" -->
<!-- Replace the content tag with appropriate information -->
<meta name="description" content="Instructing and Evaluating Generative Models">
<meta property="og:title" content="Instructing and Evaluating Generative Models" />
<meta property="og:description" content="Instructing and Evaluating Generative Models" />
<meta property="og:url" content="https://ml-research.github.io/human-centered-genai/index.html" />
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x630 -->
<meta property="og:image" content="static/image/your_banner_image.png" />
<meta property="og:image:width" content="1200" />
<meta property="og:image:height" content="630" />
<meta name="twitter:title" content="TWITTER BANNER TITLE META TAG">
<meta name="twitter:description" content="TWITTER BANNER DESCRIPTION META TAG">
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x600 -->
<meta name="twitter:image" content="/human-centered-genai/static/images/thumbnails/sega_thumbnail.png">
<meta name="twitter:card" content="summary_large_image">
<!-- Keywords for your paper to be indexed by-->
<meta name="keywords" content="KEYWORDS SHOULD BE PLACED HERE">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Instructing and Evaluating Generative Models</title>
<link rel="icon" type="image/x-icon" href="static/images/favicon.png">
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
<link rel="stylesheet" href="static/css/bib-publication-list.css" />
<link rel="stylesheet" href="static/css/tiles.css">
<link rel="stylesheet" href="static/css/team.css">
<link rel="stylesheet" href="static/css/bulma.min.css">
<link rel="stylesheet" href="static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="static/css/bulma-slider.min.css">
<link rel="stylesheet" href="static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="static/css/index.css">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet"
integrity="sha384-9ndCyUaIbzAi2FUVXJi0CjmCapSmO7SnpJef0486qhLnuZ2cdeRhO02iuK6FUUVM" crossorigin="anonymous">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
<script defer src="static/js/fontawesome.all.min.js"></script>
<script src="static/js/bulma-carousel.min.js"></script>
<script src="static/js/bulma-slider.min.js"></script>
<script src="static/js/index.js"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"
integrity="sha384-geWF76RCwLtnZ8qwWowPQNguL3RmwHVBC9FhGdlKrxdiJJigb/j/68SIy3Te4Bkz"
crossorigin="anonymous"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">Instructing and Evaluating Generative Models</h1>
<!--
<div class="is-size-5 publication-authors">
<span class="author-block">DFKI, hessian.AI, TU Darmstadt, LAION</span>
</div>
-->
<h3>In this ongoing research effort, our inter-organizational team based in Darmstadt, Germany, is
investigating the strengths and weaknesses of large-scale generative models.
Lately, our work has focused on generative image models: evaluating their biases and
limitations, devising methods for reliably instructing these models, and subsequently
mitigating the underlying problems.</h3>
</div>
</div>
</div>
</div>
</section>
<section class="hero">
<div class="hero-body">
<div class="container">
<h2 class="title is-3">Projects</h2>
<div class="container has-text-centered">
<h3 class="title is-4">Methods</h3>
<div class="row">
<div class="col tile" onclick="location.href='projects/semantic-guidance/index.html';">
<img src='static/images/thumbnails/sega_thumbnail.png' />
<div class="text">
<h1>Instructing text-to-image</h1>
<h2 class="animate-text">SEGA: Instructing Diffusion using Semantic Dimensions</h2>
<p class="animate-text">We present Semantic Guidance (SEGA) to enable fine-grained
instruction
of text-to-image models.
(SEGA) allows for subtle and extensive edits, changes in composition and style, as
well
as
optimizing the overall artistic conception. </p>
<div class="dots">
<span></span>
<span></span>
<span></span>
</div>
</div>
</div>
<div class="col tile" onclick="location.href='projects/multifusion/index.html';">
<img src='static/images/thumbnails/multifusion_thumbnail.png' />
<div class="text">
<h1>Multi-Modal, Multi-Lingual Generation</h1>
<h2 class="animate-text">MultiFusion: Fusing Pre-Trained Models for Multi-Lingual,
Multi-Modal
Image Generation</h2>
<p class="animate-text">We propose MultiFusion that allows one to express complex and
nuanced
concepts with arbitrarily interleaved inputs of multiple modalities and languages.
MutliFusion leverages pre-trained models and aligns them for integration into a
cohesive
system, thereby avoiding the need for extensive training from scratch. </p>
<div class="dots">
<span></span>
<span></span>
<span></span>
</div>
</div>
</div>
</div>
</div>
<div class="container has-text-centered">
<div class="row">
<div class="col tile" onclick="location.href='https://leditsplusplus-project.static.hf.space';">
<img src='static/images/thumbnails/ledits_thumbnail.png' />
<div class="text">
<h1>Real Image Editing</h1>
<h2 class="animate-text">LEDITS++: Limitless Image Editing using Text-to-Image Models</h2>
<p class="animate-text">We propose LEDITS++ a novel method for textual editing of images using diffusion models.
LEDITS++ is architecture agnostic, computationally efficient, supports versatile edis and limits changes to
the relevant image regions.</p>
<div class="dots">
<span></span>
<span></span>
<span></span>
</div>
</div>
</div>
<div class="col tile" onclick="location.href='projects/llavaguard/index.html';">
<img src='static/images/thumbnails/llavaguard.jpeg' />
<div class="text">
<h1>Multi-Modal Content Moderation</h1>
<h2 class="animate-text">LlavaGuard: Leveraging VLMs for Multi-Modal Content Moderation
Image Generation</h2>
<p class="animate-text">We propose LlavaGuard that allows conduct safety analyis of vision datasets and generative models.
To this end, we use a taxnomy that can be adjusted flexibly. LlavaGuard is architecture agnostic and can be used with any generative model.
</p>
</div>
</div>
</div>
</div>
<div class="container has-text-centered">
<h3 class="title is-4">Responsible AI</h3>
<div class="row">
<div class="col tile" onclick="location.href='projects/safe-latent-diffusion/index.html';">
<img src='static/images/thumbnails/sld_thumbnail.png' />
<div class="text">
<h1>Mitigating Inappropriateness</h1>
<h2 class="animate-text">Safe Latent Diffusion: Mitigating Inappropriate Degeneration in
Diffusion Models</h2>
<p class="animate-text">
Safe Latent Diffusion suppresses inappropriate degeneration of generative image
models.
Additionally, we establish a novel image generation test bed-inappropriate
image prompts (I2P)-containing dedicated, real-world image-to-text prompts covering
concepts
such as nudity and violence. </p>
<div class="dots">
<span></span>
<span></span>
<span></span>
</div>
</div>
</div>
<div class="col tile" onclick="location.href='projects/fair-diffusion/index.html';">
<img src='static/images/thumbnails/fairdiff_thumbnail.png' />
<div class="text">
<h1>Instructing on Fairness</h1>
<h2 class="animate-text">Fair Diffusion: Instructing Text-to-Image Generation Models on
Fairness</h2>
<p class="animate-text">We investigate biases of text-to-image models across all
components
of
the pipeline.
We propose Fair Diffusion for shifting a bias, based on human instructions, in any
direction
yielding arbitrarily new proportions for, e.g., identity groups</p>
<div class="dots">
<span></span>
<span></span>
<span></span>
</div>
</div>
</div>
<div class="col tile" onclick="location.href='projects/t2i-eval/index.html';">
<img src='static/images/thumbnails/benchmark_thumbnail.png' />
<div class="text">
<h1>Large-scale Evaluation</h1>
<h2 class="animate-text">Mitigating Inappropriateness in Image Generation: Can there be
Value in
Reflecting the World's Ugliness?</h2>
<p class="animate-text">We demonstrate inappropriate degeneration on a large-scale for
various
generative text-to-image models, thus motivating the need for monitoring and
moderating
them
at deployment.
To this end, we evaluate mitigation strategies at inference to suppress the
generation
of
inappropriate content. </p>
<div class="dots">
<span></span>
<span></span>
<span></span>
</div>
</div>
</div>
</div>
</div>
<!--
<div class="container has-text-centered">
<h3 class="title is-4">Privacy & Security</h3>
<div class="row">
<div class="col tile" onclick="location.href='projects/clip';">
<img src='static/images/thumbnails/clip_thumbnail.png' />
<div class="text">
<h1>Privacy in CLIP</h1>
<h2 class="animate-text">Does CLIP know my face?</h2>
<p class="animate-text">Our large-scale experiments on CLIP demonstrate that individuals
used for training can be identified with very high accuracy. We confirm that the
model has learned to associate names with depicted individuals, implying the
existence of sensitive information that can be extracted by adversaries.
</p>
<div class="dots">
<span></span>
<span></span>
<span></span>
</div>
</div>
</div>
<div class="col tile" onclick="location.href='projects/backdoor';">
<img src='static/images/thumbnails/rickrolling_thumbnail.png' />
<div class="text">
<h1>Backdoor Attacks</h1>
<h2 class="animate-text">Rickrolling the Artist: Injecting Invisible Backdoors into
Text-Guided Image Generation Models</h2>
<p class="animate-text">We introduce backdoor attacks against text-guided generative
models and demonstrate that their text encoders pose a major tampering risk.
</p>
<div class="dots">
<span></span>
<span></span>
<span></span>
</div>
</div>
</div>
</div>
</div>
-->
</div>
</div>
</section>
<section class="hero is-small is-light">
<div class="hero-body">
<div class="container">
<!-- Paper video. -->
<h2 class="title is-3">People</h2>
<div class="row">
<div class="col-md-4 col-sm-6">
<div class="our-team">
<div class="team-image">
<img src="static/images/mbrack.JPG" alt="Profile picture of Manuel Brack">
<p class="description">
Manuel is a PhD candidate at the German Research Center for AI (DFKI) and TU Darmstadt.
In his research, he focuses on human-centric AI in the context of large-scale generative models.
</p>
<ul class="social">
<li><a href="https://www.aiml.informatik.tu-darmstadt.de/people/mbrack"
target="_blank"><i class="fa fa-user"></i></a></li>
<li><a href="https://www.linkedin.com/in/manuel-brack-17b07718b/" target="_blank"><i
class="fab fa-linkedin-in"></i></a></li>
<li><a href="https://twitter.com/MBrack_AIML" target="_blank"><i
class="fab fa-twitter"></i></a></li>
<li><a href="https://scholar.google.com/citations?user=kJ9Abf8AAAAJ"
target="_blank"><i class="ai ai-google-scholar"></i></a></li>
</ul>
</div>
<div class="team-info">
<h3 class="title">Manuel Brack</h3>
<span class="post">DFKI, TU Darmstadt</span>
</div>
</div>
</div>
<div class="col-md-4 col-sm-6">
<div class="our-team">
<div class="team-image">
<img src="static/images/bdeiseroth.jpg" alt="Profile picture of Björn Deiseroth">
<p class="description">
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent urna diam, maximus
ut
ullamcorper quis, placerat id eros. Duis semper justo sed condimentum rutrum. Nunc
tristique purus turpis. Maecenas vulputate.
</p>
<ul class="social">
<li><a href="#"><i class="fa fa-user"></i></a></li>
<li><a href="#"><i class="fab fa-linkedin-in"></i></a></li>
<li><a href="#"><i class="fab fa-twitter"></i></a></li>
<li><a href="#"><i class="ai ai-google-scholar"></i></a></li>
</ul>
</div>
<div class="team-info">
<h3 class="title">Björn Deiseroth</h3>
<span class="post">Aleph Alpha, TU Darmstadt</span>
</div>
</div>
</div>
<div class="col-md-4 col-sm-6">
<div class="our-team">
<div class="team-image">
<img src="static/images/ffriedrich.png" alt="Profile picture of Felix Friedrich">
<p class="description">
Felix is a PhD candidate at hessian.AI and TU Darmstadt. In his research, he focuses
on fairness and explainability in AI models, integrating the human in the loop.
</p>
<ul class="social">
<li><a href="https://www.aiml.informatik.tu-darmstadt.de/people/ffriedrich"
target="_blank"><i class="fa fa-user"></i></a></li>
<li><a href="https://www.linkedin.com/in/felix-friedrich-425587281/" target="_blank"><i
class="fab fa-linkedin-in"></i></a></li>
<li>
<a href="https://www.semanticscholar.org/author/Felix-Friedrich/2055616945?sort=influence"
target="_blank"><i class="ai ai-semantic-scholar ai-2x"></i></a>
</li>
</ul>
</div>
<div class="team-info">
<h3 class="title">Felix Friedrich</h3>
<span class="post">TU Darmstadt, hessian.AI</span>
</div>
</div>
</div>
</div>
<br>
<div class="row">
<div class="col-md-4 col-sm-6">
<div class="our-team">
<div class="team-image">
<img src="static/images/dhintersdorf.jpg" alt="Profile picture of Dominik Hintersdorf">
<p class="description">
Dominik is a PhD candidate at TU Darmstadt. In his research, he investigates
security
and privacy issues of deep learning systems in the context of multi-modal models.
</p>
<ul class="social">
<li><a href="https://ml-research.github.io/people/dhintersdorf"><i
class="fa fa-user"></i></a></li>
<li><a href="https://www.linkedin.com/in/dominikhintersdorf"><i
class="fab fa-linkedin-in"></i></a></li>
<li><a href="https://twitter.com/D0miH"><i class="fab fa-twitter"></i></a></li>
<li><a href="https://scholar.google.com/citations?user=acFqFjYAAAAJ"><i
class="ai ai-google-scholar"></i></a></li>
</ul>
</div>
<div class="team-info">
<h3 class="title">Dominik Hintersdorf</h3>
<span class="post">TU Darmstadt</span>
</div>
</div>
</div>
<div class="col-md-4 col-sm-6">
<div class="our-team">
<div class="team-image">
<img src="static/images/pschramowski.jpeg" alt="Profile picture of Patrick Schramowski">
<p class="description">
Patrick is a senior researcher at the German Research Center for AI (DFKI) and
hessian.AI. In his research, he focuses on human-centric AI and AI alignment in the
context of large-scale generative models.
</p>
<ul class="social">
<li><a href="https://www.aiml.informatik.tu-darmstadt.de/people/pschramowski"
target="_blank"><i class="fa fa-user"></i></a></li>
<li><a href="https://www.linkedin.com/in/patrick-schramowski-7b9880109/"
target="_blank"><i class="fab fa-linkedin-in"></i></a></li>
<li><a href="https://twitter.com/schrame90" target="_blank"><i
class="fab fa-twitter"></i></a></li>
<li><a href="https://scholar.google.com/citations?user=GD481RkAAAAJ"
target="_blank"><i class="ai ai-google-scholar"></i></a></li>
</ul>
</div>
<div class="team-info">
<h3 class="title">Patrick Schramowski</h3>
<span class="post">DFKI, TU Darmstadt, hessian.AI</span>
</div>
</div>
</div>
<div class="col-md-4 col-sm-6">
<div class="our-team">
<div class="team-image">
<img src="static/images/lstruppek.jpg" alt="Profile picture of Lukas Struppek">
<p class="description">
Lukas is a PhD candidate at TU Darmstadt. In his research, he investigates security
and privacy issues of deep learning systems in the context of generative models.
</p>
<ul class="social">
<li><a href="https://www.ml.informatik.tu-darmstadt.de/people/lstruppek/index.html"><i
class="fa fa-user"></i></a></li>
<li><a href="https://www.linkedin.com/in/lukas-struppek/"><i
class="fab fa-linkedin-in"></i></a></li>
<li><a href="https://twitter.com/LukasStruppek"><i class="fab fa-twitter"></i></a>
</li>
<li><a href="https://scholar.google.com/citations?user=tU8K5qsAAAAJ"><i
class="ai ai-google-scholar"></i></a></li>
</ul>
</div>
<div class="team-info">
<h3 class="title">Lukas Struppek</h3>
<span class="post">Tu Darmstadt</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="hero is-small">
<div class="hero-body">
<div class="container">
<!-- Paper video. -->
<h2 class="title is-3">Relevant Publications</h2>
<noscript>
<!-- bibtex source hidden by default, show it if JS disabled -->
<style>
#bibtex {
display: block;
}
</style>
</noscript>
<table id="pubTable" class="display"></table>
<pre id="bibtex" style="display:none;">
@inproceedings{schramowski2022safe,
Anote = {./images/schramowski2022safe.png},
title={Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models},
author={Patrick Schramowski and Manuel Brack and Björn Deiseroth and Kristian Kersting},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2023},
month={Jun},
Note = {Text-conditioned image generation models have recently achieved astonishing results in image quality and text alignment and are consequently employed in a fast-growing number of applications. Since they are highly data-driven, relying on billion-sized datasets randomly scraped from the internet, they also suffer, as we demonstrate, from degenerated and biased human behavior. In turn, they may even reinforce such biases. To help combat these undesired side effects, we present safe latent diffusion (SLD). Specifically, to measure the inappropriate degeneration due to unfiltered and imbalanced training sets, we establish a novel image generation test bed-inappropriate image prompts (I2P)-containing dedicated, real-world image-to-text prompts covering concepts such as nudity and violence. As our exhaustive empirical evaluation demonstrates, the introduced SLD removes and suppresses inappropriate image parts during the diffusion process, with no additional training required and no adverse effect on overall image quality or text alignment.},
Pages = {},
Keywords = {Safety, Text-to-Image Synthesis, Text-Guided Image Generation, Stable Diffusion, Ethics},
Url={https://arxiv.org/abs/2211.05105},
highlight={1}
}
@misc{brack2022Stable,
Anote = {./images/sega_graphic.png},
title={The Stable Artist: Steering Semantics in Diffusion Latent Space},
author={Manuel Brack and Patrick Schramowski and Felix Friedrich and Dominik Hintersdorf and Kristian Kersting},
howpublished = {arXiv preprint arXiv:2212.06013},
year = {2022},
month={Dec},
Note = {Large, text-conditioned generative diffusion models have recently gained a lot of attention for their impressive performance in generating high-fidelity images from text alone. However, achieving high-quality results is almost unfeasible in a one-shot fashion. On the contrary, text-guided image generation involves the user making many slight changes to inputs in order to iteratively carve out the envisioned image. However, slight changes to the input prompt often lead to entirely different images being generated, and thus the control of the artist is limited in its granularity. To provide flexibility, we present the Stable Artist, an image editing approach enabling fine-grained control of the image generation process. The main component is semantic guidance (SEGA) which steers the diffusion process along variable numbers of semantic directions. This allows for subtle edits to images, changes in composition and style, as well as optimization of the overall artistic conception. Furthermore, SEGA enables probing of latent spaces to gain insights into the representation of concepts learned by the model, even complex ones such as 'carbon emission'. We demonstrate the Stable Artist on several tasks, showcasing high-quality image editing and composition.},
Pages = {},
Keywords = {Representations, Text-to-Image Synthesis, Text-Guided Image Generation, Stable Diffusion, Concepts, Semantics},
Url={https://arxiv.org/abs/2212.06013},
highlight={0}
}
@inproceedings{brack2023sega,
Anote = {./images/sega_graphic.png},
title={SEGA: Instructing Text-to-Image Models using Semantic Guidance},
author={Manuel Brack and Felix Friedrich and Dominik Hintersdorf and Lukas Struppek and Patrick Schramowski and Kristian Kersting},
year = {2023},
month={Dec},
Pages = {},
booktitle = {Proceedings of the 37th Conference on Neural Information Processing Systems (NeurIPS)},
Note = {Text-to-image diffusion models have recently received a lot of interest for their astonishing ability to produce high-fidelity images from text only. However, achieving one-shot generation that aligns with the user’s intent is nearly impossible, yet small changes to the input prompt often result in very different images. This leaves the user with little semantic control. To put the user in control, we show how to interact with the diffusion process to flexibly steer it along semantic directions. This semantic guidance (SEGA) generalizes to any generative architecture using classifier-free guidance. More importantly, it allows for subtle and extensive edits, composition and style changes, and optimizing the overall artistic conception. We demonstrate SEGA’s effectiveness on both latent and pixel-based diffusion models such as Stable Diffusion, Paella, and DeepFloyd-IF using a variety of tasks, thus providing strong evidence for its versatility and flexibility.},
Keywords = {Representations, Text-to-Image Synthesis, Text-Guided Image Generation, Stable Diffusion, Concepts, Semantics},
Url={https://arxiv.org/abs/2211.05105},
highlight={1}
}
@misc{friedrich2023fair,
Anote = {./images/ffriedrich_fair_2023.png},
title={Fair Diffusion: Instructing Text-to-Image Generation Models on Fairness},
author={Felix Friedrich and Manuel Brack and Dominik Hintersdorf and Lukas Struppek and Patrick Schramowski and Sasha Luccioni and Kristian Kersting},
howpublished = {arXiv preprint arXiv:2302.10893},
year = {2023},
month={Feb},
Note = {Generative AI models have recently achieved astonishing results in quality and are consequently employed in a fast-growing number of applications. However, since they are highly data-driven, relying on billion-sized datasets randomly scraped from the internet, they also suffer from degenerated and biased human behavior, as we demonstrate. In fact, they may even reinforce such biases. To not only uncover but also combat these undesired effects, we present a novel strategy, called Fair Diffusion, to attenuate biases after the deployment of generative text-to-image models. Specifically, we demonstrate shifting a bias, based on human instructions, in any direction yielding arbitrarily new proportions for, e.g., identity groups. As our empirical evaluation demonstrates, this introduced control enables instructing generative image models on fairness, with no data filtering and additional training required.},
Pages = {},
Keywords = {Fairness, Text-to-Image Synthesis, Text-Guided Image Generation, Stable Diffusion, AI Ethics},
Url={https://arxiv.org/abs/2302.10893},
highlight={1}
}
@misc{struppek23caia,
Anote={./images/caia.jpeg},
author = {Lukas Struppek and Dominik Hintersdorf and Felix Friedrich and Manuel Brack and Patrick Schramowski and Kristian Kersting},
title = {Image Classifiers Leak Sensitive Attributes About Their Classes},
howpublished = {arXiv preprint arXiv:2303.09289},
year = {2023},
Url = {https://arxiv.org/pdf/2303.09289},
Pages = {},
Note = {Neural network-based image classifiers are powerful tools for computer vision tasks, but they inadvertently reveal sensitive
attribute information about their classes, raising concerns about their privacy. To investigate this privacy leakage, we introduce the first
Class Attribute Inference Attack (Caia), which leverages recent advances in text-to-image synthesis to infer sensitive attributes of individual
classes in a black-box setting, while remaining competitive with related white-box attacks. Our extensive experiments in the face recognition
domain show that Caia can accurately infer undisclosed sensitive attributes, such as an individual's hair color, gender and racial appearance,
which are not part of the training labels. Interestingly, we demonstrate that adversarial robust models are even more vulnerable to such privacy
leakage than standard models, indicating that a trade-off between robustness and privacy exists.},
Keywords = {Privacy, Text-to-Image Synthesis, Text-Guided Image Generation, Stable Diffusion},
highlight={0}
}
@incollection{brack2023mitigating,
author = {Manuel Brack and Felix Friedrich and Patrick Schramowski and Kristian Kersting},
title = {Mitigating Inappropriateness in Image Generation: Can there be Value in Reflecting the World's Ugliness?},
booktitle = {ICML 2023 Workshop on Challenges of Deploying Generative AI},
year = {2023},
Pages = {},
Note = {Text-conditioned image generation models have recently achieved astonishing results in image quality and text alignment and are consequently employed in a fast-growing number of applications. Since they are highly data-driven, relying on billion-sized datasets randomly scraped from the web, they also reproduce inappropriate human behavior. Specifically, we demonstrate inappropriate degeneration on a large-scale for various generative text-to-image models, thus motivating the need for monitoring and moderating them at deployment. To this end, we evaluate mitigation strategies at inference to suppress the generation of inappropriate content. Our findings show that we can use models' representations of the world's ugliness to align them with human preferences.},
Keywords = {Image Synthesis, Image Generation, Diffusion, AI Ethics, Inappropriateness, Evaluation, Mitigation},
Url = {https://arxiv.org/pdf/2305.18398},
highlight={1}
}
@inproceedings{deiseroth2023atman,
Anote={./images/deb2023atman.png},
author = {Björn Deiseroth and Mayukh Deb and Samuel Weinbach and Manuel Brack and Patrick Schramowski and Kristian Kersting},
title = {AtMan: Understanding Transformer Predictions Through Memory Efficient Attention Manipulation},
Keywords = {Explainable AI, Transformer, Large Language Models, Multimodal, Computer Vision},
year = {2023},
Url = {https://arxiv.org/abs/2301.08110},
booktitle = {Proceedings of the 37th Conference on Neural Information Processing Systems (NeurIPS)},
Note= {Generative transformer models have become increasingly complex, with large numbers of parameters and the ability to process multiple input modalities. Current methods for explaining their predictions are resource-intensive. Most crucially, they require prohibitively large amounts of additional memory since they rely on backpropagation which allocates almost twice as much GPU memory as the forward pass. This renders it difficult, if not impossible, to use explanations in production. We present AtMan that provides explanations of generative transformer models at almost no extra cost. Specifically, AtMan is a modality-agnostic perturbation method that manipulates the attention mechanisms of transformers to produce relevance maps for the input with respect to the output prediction. Instead of using backpropagation, AtMan applies a parallelizable token-based search method relying on cosine similarity neighborhood in the embedding space. Our exhaustive experiments on text and image-text benchmarks demonstrate that AtMan outperforms current state-of-the-art gradient-based methods on several metrics and models while being computationally efficient. As such, AtMan is suitable for use in large model inference deployments.},
highlight={0}
}
@inproceedings{bellagente2023multifusion,
Anote={./images/bellagente2023multifusion.png},
author = {Marco Bellagente and Manuel Brack and Hannah Teufel and Felix Friedrich and Björn Deiseroth and Constantin Eichenberg and Andrew Dai and Robert Baldock and Souradeep Nanda and Koen Oostermeijer and Andres Felipe Cruz-Salinas and Patrick Schramowski and Kristian Kersting and Samuel Weinbach},
title = {MultiFusion: Fusing Pre-Trained Models for Multi-Lingual, Multi-Modal Image Generation},
year = {2023},
Url = {https://arxiv.org/abs/2305.15296},
Pages = {},
Note = {The recent popularity of text-to-image diffusion models (DM) can largely be attributed to the intuitive interface they provide to users. The intended generation can be expressed in natural language, with the model producing faithful interpretations of text prompts. However, expressing complex or nuanced ideas in text alone can be difficult. To ease image generation, we propose MultiFusion that allows one to express complex and nuanced concepts with arbitrarily interleaved inputs of multiple modalities and languages. MutliFusion leverages pre-trained models and aligns them for integration into a cohesive system, thereby avoiding the need for extensive training from scratch. Our experimental results demonstrate the efficient transfer of capabilities from individual modules to the downstream model. Specifically, the fusion of all independent components allows the image generation module to utilize multilingual, interleaved multimodal inputs despite being trained solely on monomodal data in a single language.},
Keywords = {Image Synthesis, Image Generation, Diffusion, Multimodality, Multilingualism},
booktitle = {Proceedings of the 37th Conference on Neural Information Processing Systems (NeurIPS)},
highlight={1}
}
@inproceedings{struppek23rickrolling,
Anote = {./images/struppek_rickrolling.jpg},
author = {Lukas Struppek and Dominik Hintersdorf and Kristian Kersting},
title = {Rickrolling the Artist: Injecting Backdoors into Text Encoders for Text-to-Image Synthesis},
Note = {While text-to-image synthesis currently enjoys great popularity among researchers and the general public, the security of these models has been neglected so far. Many text-guided image generation models rely on pre-trained text encoders from external sources, and their users trust that the retrieved models will behave as promised. Unfortunately, this might not be the case. We introduce backdoor attacks against text-guided generative models and demonstrate that their text encoders pose a major tampering risk. Our attacks only slightly alter an encoder so that no suspicious model behavior is apparent for image generations with clean prompts. By then inserting a single character trigger into the prompt, e.g., a non-Latin character or emoji, the adversary can trigger the model to either generate images with pre-defined attributes or images following a hidden, potentially malicious description. We empirically demonstrate the high effectiveness of our attacks on Stable Diffusion and highlight that the injection process of a single backdoor takes less than two minutes. Besides phrasing our approach solely as an attack, it can also force an encoder to forget phrases related to certain concepts, such as nudity or violence, and help to make image generation safer.},
year={2023},
Pages = {},
Keywords = {Backdoor Attack, Generative AI, CLIP, Text2Image Synthesis, Homoglyphs},
booktitle={Proceedings of the 19th IEEE/CVF International Conference on Computer Vision (ICCV)},
Url={https://arxiv.org/pdf/2211.02408.pdf},
highlight={0}
}
@incollection{brack2023ledits,
Anote = {./images/mbrack_ledits_pp.png},
title={LEDITS++: Limitless Image Editing using Text-to-Image Models},
author={Manuel Brack and Felix Friedrich and Katharina Kornmeier and Linoy Tsaban and Patrick Schramowski and Kristian Kersting and Apolinaros Passos},
booktitle = {Workshop on Machine Learning for Creativity and Design at NeurIPS},
year = {2023},
month={Dec},
Note = {Text-to-image diffusion models have recently received a lot of interest for their astonishing ability to produce high-fidelity images from text only. Subsequent research efforts are aiming to exploit the capabilities of these models and leverage them for intuitive, textual image editing. However, existing methods often require time-consuming fine-tuning and lack native support for performing multiple edits simultaneously. To address these issues, we introduce LEDITS++ , an efficient yet versatile technique for image editing using text-to-image models. LEDITS++ requires no tuning nor optimization, runs in a few diffusion steps, natively supports multiple simultaneous edits, inherently limits changes to relevant image regions, and is architecture agnostic.},
Pages = {},
Keywords = {Image Editing, Text-to-Image Synthesis, Text-Guided Image Generation, Stable Diffusion, Semantics},
Url={https://arxiv.org/abs/2311.16711},
highlight={1}
}
@incollection{brack2023distilling,
Anote = {./images/brack2023distilling.png},
title={Distilling Adversarial Prompts from Safety Benchmarks: Report for the Adversarial Nibbler Challenge},
author={Manuel Brack and Patrick Schramowski and Kristian Kersting},
booktitle = {Working Notes of the AACL Workshop on the ART of Safety (ARTS): Workshop on Adversarial testing and Red-Teaming for generative AI},
year = {2023},
Note = {Text-conditioned image generation models have recently achieved astonishing image quality and alignment results. Consequently, they are employed in a fast-growing number of applications. Since they are highly data-driven, relying on billion-sized datasets randomly scraped from the web, they also produce unsafe content. As a contribution to the Adversarial Nibbler challenge, we distill a large set of over 1,000 potential adversarial inputs from existing safety benchmarks. Our analysis of the gathered prompts and corresponding images demonstrates the fragility of input filters and provides further insights into systematic safety issues in current generative image models.},
Pages = {},
Keywords = {Text-to-Image Synthesis, Text-Guided Image Generation, Stable Diffusion, Safety, Adversarial Prompting},
Url={https://arxiv.org/abs/2309.11575},
highlight={0}
}
@incollection{hintersdorf23defendingbugs,
Anote={./images/defending_with_backdoors.png},
author = {Dominik Hintersdorf and Lukas Struppek and Daniel Neider and Kristian Kersting},
title = {Defending Our Privacy With Backdoors},
year = {2023},
Url = {https://arxiv.org/abs/2310.08320},
Pages = {},
booktitle={NeurIPS 2023 Workshop on Backdoors in Deep Learning - The Good, the Bad, and the Ugly},
Note = {The proliferation of large AI models trained on uncurated, often sensitive web-scraped data has raised significant privacy concerns. One of the concerns is that adversaries can extract information about the training
data using privacy attacks. Unfortunately, the task of removing specific information from the models without sacrificing performance is not straightforward and has proven to be challenging. We propose a rather easy yet
effective defense based on backdoor attacks to remove private information such as names of individuals from models, and focus in this work on text encoders. Specifically, through strategic insertion of backdoors, we
align the embeddings of sensitive phrases with those of neutral terms-"a person" instead of the person's name. Our empirical results demonstrate the effectiveness of our backdoor-based defense on CLIP by assessing its
performance using a specialized privacy attack for zero-shot classifiers. Our approach provides not only a new "dual-use" perspective on backdoor attacks, but also presents a promising avenue to enhance the privacy of
individuals within models trained on uncurated web-scraped data.},
Keywords = {Security, Privacy, Backdoor Attacks, CLIP, Identity Inference Attacks},
highlight={0}
}
@misc{struppek23leveraging,
Anote={./images/backdoor_defense.png},
author = {Lukas Struppek and Martin B. Hentschel and Clifton Poth and Dominik Hintersdorf and Kristian Kersting},
title = {Leveraging Diffusion-Based Image Variations for Robust Training on Poisoned Data},
Howpublished = {arXiv preprint arXiv:2310.06372},
year = {2023},
Url = {https://arxiv.org/pdf/2310.06372},
Pages = {},
Note = {Backdoor attacks pose a serious security threat for training neural networks as they surreptitiously introduce hidden functionalities into a model.
Such backdoors remain silent during inference on clean inputs, evading detection due to inconspicuous behavior. However, once a specific trigger pattern appears in the input data,
the backdoor activates, causing the model to execute its concealed function. Detecting such poisoned samples within vast datasets is virtually impossible through manual inspection.
To address this challenge, we propose a novel approach that enables model training on potentially poisoned datasets by utilizing the power of recent diffusion models.
Specifically, we create synthetic variations of all training samples, leveraging the inherent resilience of diffusion models to potential trigger patterns in the data.
By combining this generative approach with knowledge distillation, we produce student models that maintain their general performance on the task while exhibiting robust resistance to backdoor triggers.},
Keywords = {Security, Backdoor Attacks, Stable Diffusion, Text-to-Image Synthesis},
highlight={0}
}
@article{struppek23homoglyphs,
Anote = {./images/struppek_biased_artist.jpg},
author = {Lukas Struppek and Dominik Hintersdorf and Felix Friedrich and Manuel Brack and Patrick Schramowski and Kristian Kersting},
title = {Exploiting Cultural Biases via Homoglyphs in Text-to-Image Synthesis},
Journal = {Journal of Artificial Intelligence Research (JAIR)},
year = {2023},
month={Sep},
Note = {Models for text-to-image synthesis, such as DALL-E 2 and Stable Diffusion, have recently drawn a lot of interest from academia and the general public. These models are capable of producing high-quality images that depict a variety of concepts and styles when conditioned on textual descriptions. However, these models adopt cultural characteristics associated with specific Unicode scripts from their vast amount of training data, which may not be immediately apparent. We show that by simply inserting single non-Latin characters in a textual description, common models reflect cultural stereotypes and biases in their generated images. We analyze this behavior both qualitatively and quantitatively, and identify a model’s text encoder as the root cause of the phenomenon. Additionally, malicious users or service providers may try to intentionally bias the image generation to create racist stereotypes by replacing Latin characters with similarly-looking characters from non-Latin scripts, so-called homoglyphs. To mitigate such unnoticed script attacks, we propose a novel homoglyph unlearning method to fine-tune a text encoder, making it robust against homoglyph manipulations.},
Pages = {},
Keywords = {Text-to-Image Synthesis, Text-Guided Image Generation, DALL-E 2, Stable Diffusion, Computer Vision},
Url={https://arxiv.org/pdf/2209.08891.pdf},
highlight={0}
}
@misc{hintersdorf2022clipping_privacy,
Anote = {./images/hintersdorf2022clipping_privacy.png},
title={Does CLIP Know My Face?},
author={Dominik Hintersdorf and Lukas Struppek and Manuel Brack and Felix Friedrich and Patrick Schramowski and Kristian Kersting},
year={2022},
month={Sep},
Howpublished = {arXiv preprint arXiv:2209.07341},
Note = {With the rise of deep learning in various applications, privacy concerns around the protection of training data has become a critical area of research. Whereas prior studies have focused on privacy risks in single-modal models, we introduce a novel method to assess privacy for multi-modal models, specifically vision-language models like CLIP. The proposed Identity Inference Attack (IDIA) reveals whether an individual was included in the training data by querying the model with images of the same person. Letting the model choose from a wide variety of possible text labels, the model reveals whether it recognizes the person and, therefore, was used for training. Our large-scale experiments on CLIP demonstrate that individuals used for training can be identified with very high accuracy. We confirm that the model has learned to associate names with depicted individuals, implying the existence of sensitive information that can be extracted by adversaries. Our results highlight the need for stronger privacy protection in large-scale models and suggest that IDIAs can be used to prove the unauthorized use of data for training and to enforce privacy laws.},
Pages = {},
Keywords = {Identity Inference Attacks, Privacy, Computer Vision, Pre-trained models, CLIP, Deep Learning},
Url={https://arxiv.org/pdf/2209.07341.pdf},
highlight={0}
}
@inproceedings{brack2023illume,
url = {https://arxiv.org/abs/2208.08241},
author = {Manuel Brack and Patrick Schramowski and Björn Deiseroth and Kristian Kersting},
title = {ILLUME: Rationalizing Vision-Language Models through Human Interactions},
Anote = {./images/brack2022illume.png},
Keywords = {Alignment, Self-Generated Explanations, XAI, Explanatory Interactive Learning},
Note = {Bootstrapping from pre-trained language models has been proven to be an efficient approach for building vision-language models (VLM) for tasks such as image captioning or visual question answering. However, outputs of these models rarely align with user's rationales for specific answers. In order to improve this alignment and reinforce commonsense reasons, we propose a tuning paradigm based on human interactions with machine generated data. Our ILLUME executes the following loop: Given an image-question-answer prompt, the VLM samples multiple candidate rationales, and a human critic provides minimal feedback via preference selection, used for fine-tuning. This loop increases the training data and gradually carves out the VLM's rationalization capabilities that are aligned with human intend. Our exhaustive experiments demonstrate that ILLUME is competitive with standard supervised fine-tuning while using significantly fewer training data and only requiring minimal feedback.},
year={2023},
booktitle = {Proceedings of the 40th International Conference on Machine Learning (ICML)},
Url = {https://arxiv.org/pdf/2208.08241.pdf},
highlight={0}
}
@inproceedings{friedrich2022hhai,
Anote = {./images/friedrich2022hhai.png},
title={Interactively Providing Explanations for Transformer Language Models},
author={Felix Friedrich and Patrick Schramowski and Christopher Tauchmann and Kristian Kersting},
Note = {Transformer language models (LMs) are state of the art in a multitude of NLP tasks. Despite these successes, their opaqueness remains problematic, especially as the training data might be unfiltered and contain biases. As a result, ethical concerns about these models arise, which can have a substantial negative impact on society as they get increasingly integrated into our lives. Therefore, it is not surprising that a growing body of work aims to provide interpretability and explainability to black-box LMs: Recent evaluations of saliency or attribution methods find that, while intriguing, different methods assign importance to different inputs for the same outputs, thus encouraging misinterpretation and reporting bias. Moreover, these methods primarily focus on post-hoc explanations of (sometimes spurious) input-output correlations. Instead, we emphasize using (interactive) prototype networks directly incorporated into the model architecture and hence explain the reasoning behind the network’s decisions.},
year={2022},
Pages = {},
Keywords = {Transformer, Large Language Models, Prototype Layers, Explainable AI, Explanatory Interactive Learning},
booktitle= {Proceedings of the 1st Conference of Hybrid Human Artificial Intelligence (HHAI) and in Frontiers in Artificial Intelligence and Applications},
Url={./papers/friedrich2022hhai.pdf},
highlight={0}
}
@article{schramowski2022nmi_moral,
Anote = {./images/schramowski2022nmi_moral.png},
title = {Large pre-trained language models contain human-like biases of what is right and wrong to do},
Author = {Patrick Schramowski and Cigdem Turan and Nico Andersen and Constantin A. Rothkopf and Kristian Kersting},
Journal = {Nature Machine Intelligence},
Note = {Artificial writing is permeating our lives due to recent advances in large-scale, transformer-based
language models (LMs) such as BERT, GPT-2 and GPT-3, and others. Using them as pre-trained models and fine-tuning them
for specific tasks, researchers have extended the state of the art for many natural language processing (NLP) tasks and
shown that they capture not only linguistic knowledge but also retain general knowledge implicitly present in the data.
Unfortunately, LMs trained on unfiltered text corpora suffer from degenerated and biased behaviour. While this is well
established, we show here that recent LMs also contain human-like biases of what is right and wrong to do, reflecting
existing ethical and moral norms of society. We show that these norms can be captured geometrically by a ‘moral direction’
which can be computed, e.g., by a PCA, in the embedding space. The computed ‘moral direction’ can rate the normativity
(or non-normativity) of arbitrary phrases without explicitly training the LM for this task, reflecting social norms well.
We demonstrate that computing the ‘moral direction’ can provide a path for attenuating or even preventing toxic degeneration
in LMs, showcasing this capability on the RealToxicityPrompts testbed.},
Keywords = {Deep Learning, Transformer, Machine Ethics, Moral, Values, Human Bias, Stereotypes, Moral Choices},
Publisher = {Nature Publishing Group},
year={2022},
month={Mar},
day={01},
volume={4},
number={3},
pages={258-268},
issn={2522-5839},
doi={10.1038/s42256-022-00458-8},
url={https://arxiv.org/abs/2103.11790},
highlight={0}
}
@article{friedrich2023xiltypology,
Anote = {./images/friedrich2023xiltypology.png},
title = {A typology for exploring the mitigation of shortcut behaviour},
author={Felix Friedrich and Wolfgang Stammer and Patrick Schramowski and Kristian Kersting},
Journal = {Nature Machine Intelligence},
Note = {As machine learning models become larger, and are increasingly trained on large and uncurated datasets in weakly supervised mode, it becomes important to establish mechanisms for inspecting, interacting with and revising models. These are necessary to mitigate shortcut learning effects and to guarantee that the model’s learned knowledge is aligned with human knowledge. Recently, several explanatory interactive machine learning methods have been developed for this purpose, but each has different motivations and methodological details. In this work, we provide a unification of various explanatory interactive machine learning methods into a single typology by establishing a common set of basic modules. We discuss benchmarks and other measures for evaluating the overall abilities of explanatory interactive machine learning methods. With this extensive toolbox, we systematically and quantitatively compare several explanatory interactive machine learning methods. In our evaluations, all methods are shown to improve machine learning models in terms of accuracy and explainability. However, we found remarkable differences in individual benchmark tasks, which reveal valuable application-relevant aspects for the integration of these benchmarks in the development of future methods.},
Keywords = {Explanatory Interactive Machine Learning (XIL), Research Transparency and Comparability, Explainable Artificial Intelligence (XAI), Human-AI Interaction, Human-guided AI},
Publisher = {Nature Publishing Group},
year={2023},
month={Mar},
day={09},
volume={5},
pages={319-330},
issn={2522-5839},
doi={10.1038/s42256-023-00612-w},
url={https://arxiv.org/abs/2203.03668},
highlight={0}
}
@inproceedings{friedrich2023oneexp,
Anote = {./images/friedrich2023xiltypology.png},
title = {One explanation does not fit XIL},
author={Felix Friedrich and David Steinmann and Kristian Kersting},
booktitle = {Proceedings of the International Conference on Representation Learning (ICLR), Tiny Paper},
Note = {Current machine learning models produce outstanding results in many areas but, at the same time, suffer from shortcut learning and spurious correlations. To address such flaws, the explanatory interactive machine learning (XIL) framework has been proposed to revise a model by employing user feedback on a model's explanation. This work sheds light on the explanations used within this framework. In particular, we investigate simultaneous model revision through multiple explanation methods. To this end, we identified that one explanation does not fit XIL and propose considering multiple ones when revising models via XIL.},
Keywords = {Explanatory Interactive Machine Learning (XIL), Explainable Artificial Intelligence (XAI), Human-AI Interaction, Human-guided AI},
year={2023},
Url={../../papers/friedrich2023oneexp.pdf},
highlight={0}
}
@inproceedings{struppek2022ppa,
Anote = {./images/struppek_ppa.jpg},
title={Plug & Play Attacks: Towards Robust and Flexible Model Inversion Attacks},
author={Lukas Struppek and Dominik Hintersdorf and Antonio De Almeida Correia and Antonia Adler and Kristian Kersting},
Note = {Model inversion attacks (MIAs) aim to create synthetic images that reflect the class-wise characteristics from a target classifier's training data by exploiting the model's learned knowledge. Previous research has developed generative MIAs using generative adversarial networks (GANs) as image priors that are tailored to a specific target model. This makes the attacks time- and resource-consuming, inflexible, and susceptible to distributional shifts between datasets. To overcome these drawbacks, we present Plug \& Play Attacks that loosen the dependency between the target model and image prior and enable the use of a single trained GAN to attack a broad range of targets with only minor attack adjustments needed. Moreover, we show that powerful MIAs are possible even with publicly available pre-trained GANs and under strong distributional shifts, whereas previous approaches fail to produce meaningful results. Our extensive evaluation confirms the improved robustness and flexibility of Plug \& Play Attacks and their ability to create high-quality images revealing sensitive class characteristics.},
year={2022},
Keywords = {Model Inversion Attacks, Secure AI, Privacy, Generative Adversarial Networks},
booktitle={Proceedings of the 39th International Conference on Machine Learning (ICML)},
Url={https://proceedings.mlr.press/v162/struppek22a/struppek22a.pdf},
highlight={0}
}
@inproceedings{hintersdorf2022ijcai_trust,
Anote = {./images/hintersdorf2021mi.png},
title={To Trust or Not To Trust Prediction Scores for Membership Inference Attacks},
author={Dominik Hintersdorf and Lukas Struppek and Kristian Kersting},
year={2022},
booktitle={Proceedings of the 31st International Joint Conference on Artificial Intelligence and the 25th European Conference on Artificial Intelligence ({IJCAI-ECAI})},
Note = {Membership inference attacks (MIAs) aim to determine whether a specific sample was used to train a predictive model. Knowing this may indeed lead to a privacy breach. Most MIAs, however, make use of the model's prediction scores - the probability of each output given some input - following the intuition that the trained model tends to behave differently on its training data. We argue that this is a fallacy for many modern deep network architectures. Consequently, MIAs will miserably fail since overconfidence leads to high false-positive rates not only on known domains but also on out-of-distribution data and implicitly acts as a defense against MIAs. Specifically, using generative adversarial networks, we are able to produce a potentially infinite number of samples falsely classified as part of the training data. In other words, the threat of MIAs is overestimated, and less information is leaked than previously assumed. Moreover, there is actually a trade-off between the overconfidence of models and their susceptibility to MIAs: the more classifiers know when they do not know, making low confidence predictions, the more they reveal the training data.},
Pages = {},
Keywords = {Membership Inference Attacks, Privacy, Deep Learning, ResNets, Tradeoff, Overconfidence, OOD},
Url={https://www.ijcai.org/proceedings/2022/0422.pdf},
highlight={0}
}
@inproceedings{struppek2022facct_hash,
Anote = {./images/struppek2021learning.png},
booktitle = {Proceedings of the ACM Conference on Fairness, Accountability, and Transparency (FAccT)},
title={Learning to Break Deep Perceptual Hashing: The Use Case NeuralHash},
author={Lukas Struppek and Dominik Hintersdorf and Daniel Neider and Kristian Kersting},
Note = {Apple recently revealed its deep perceptual hashing system NeuralHash to detect child sexual abuse material (CSAM) on user devices before files are uploaded to its iCloud service. Public criticism quickly arose regarding the protection of user privacy and the system's reliability. In this paper, we present the first comprehensive empirical analysis of deep perceptual hashing based on NeuralHash. Specifically, we show that current deep perceptual hashing may not be robust. An adversary can manipulate the hash values by applying slight changes in images, either induced by gradient-based approaches or simply by performing standard image transformations, forcing or preventing hash collisions. Such attacks permit malicious actors easily to exploit the detection system: from hiding abusive material to framing innocent users, everything is possible. Moreover, using the hash values, inferences can still be made about the data stored on user devices. In our view, based on our results, deep perceptual hashing in its current form is generally not ready for robust client-side scanning and should not be used from a privacy perspective. },
year={2022},
Pages = {},
Keywords = {secure AI, client-side scanning, perceptual hashing},
Url={./papers/struppek2022facct_hash.pdf},
howpublished = {arXiv preprint arXiv:2111.06628},
highlight={0}
}
</pre>
</div>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
This page was built using the <a
href="https://github.com/eliahuhorwitz/Academic-project-page-template"
target="_blank">Academic
Project Page Template</a>.
You are free to borrow the source code of this website; we just ask that you link back to this page in
the footer. <br> This website is licensed under a <a rel="license"
href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
Commons Attribution-ShareAlike 4.0 International License</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
<!-- Statcounter tracking code -->
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->
<!-- End of Statcounter Code -->
<script type="text/javascript" src="static/js/bib-list.js"></script>
<script type="text/javascript" src="static/js/bib-publication-list.js"></script>
<script type="text/javascript">
$(document).ready(function () {
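// bibtexify (from the bundled bib-publication-list scripts loaded above) parses the BibTeX
// entries in the hidden #bibtex <pre> and renders them as the sortable publication table #pubTable.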
bibtexify("#bibtex", "pubTable", { 'tweet': 'vkaravir' });
});
</script>
</body>
</html>