first commit
This commit is contained in:
commit
ee9158fae5
23 changed files with 8156 additions and 0 deletions
268
slides.md
Normal file
268
slides.md
Normal file
|
@ -0,0 +1,268 @@
|
|||
---
|
||||
# theme id, package name, or local path
|
||||
theme: seriph
|
||||
title: Backdoor Attacks - Hidden Threats in AI Models
|
||||
titleTemplate: '%s - AI Safety & Oversight'
|
||||
author: Stefano Rossi
|
||||
info: |
|
||||
## Methods for Scaling Human Feedback in AI Supervision
|
||||
keywords: AI Safety, Scalable Oversight, LLMs, Human Feedback, Alignment, AI Debate
|
||||
mdc: true
|
||||
hideInToc: false
|
||||
addons:
|
||||
- slidev-addon-rabbit
|
||||
- slidev-addon-python-runner
|
||||
python:
|
||||
installs: []
|
||||
prelude: ''
|
||||
loadPackagesFromImports: true
|
||||
suppressDeprecationWarnings: true
|
||||
alwaysReload: false
|
||||
loadPyodideOptions: {}
|
||||
presenter: true
|
||||
browserExporter: dev
|
||||
download: true
|
||||
exportFilename: scalable-oversight-for-ai
|
||||
twoslash: false
|
||||
lineNumbers: true
|
||||
monaco: false
|
||||
selectable: false
|
||||
record: dev
|
||||
contextMenu: dev
|
||||
wakeLock: true
|
||||
overviewSnapshots: false
|
||||
colorSchema: dark
|
||||
routerMode: history
|
||||
aspectRatio: 16/9
|
||||
canvasWidth: 980
|
||||
css:
|
||||
- unocss
|
||||
unocss:
|
||||
configFile: './uno.config.ts'
|
||||
defaults:
|
||||
layout: center
|
||||
drawings:
|
||||
enabled: true
|
||||
persist: false
|
||||
presenterOnly: false
|
||||
syncAll: true
|
||||
htmlAttrs:
|
||||
dir: ltr
|
||||
lang: en
|
||||
transition: slide-left
|
||||
background: none
|
||||
---
|
||||
|
||||
<!-- INTRO SLIDE -->
|
||||
<div class="flex flex-col items-center justify-center h-full py-10">
|
||||
<h1 class="text-center text-5xl font-bold gradient-text mb-10">Backdoor Attacks</h1>
|
||||
<h2 class="text-center text-4xl mb-6" style="color: var(--accent-color);">Hidden Threats in AI Models</h2>
|
||||
<h3 class="text-center text-3xl mb-14 animate-pulse highlight-word">Embedding Malicious Behavior in LLMs</h3>
|
||||
<div class="flex w-full justify-between mt-auto">
|
||||
<div class="text-left text-xl">Stefano Rossi</div>
|
||||
<div class="text-right text-xl">09 May, 2025</div>
|
||||
</div>
|
||||
<div class="hud-element circle-small"></div>
|
||||
<div class="hud-element circle-big"></div>
|
||||
<div class="hud-lines"></div>
|
||||
</div>
|
||||
|
||||
---
|
||||
|
||||
# Introduction
|
||||
|
||||
<div class="grid grid-cols-2 gap-6">
|
||||
<div class="panel-info">
|
||||
<ul>
|
||||
<li><span class="highlight-word">AI safety faces growing threats</span></li>
|
||||
<li><span class="highlight-word">Backdoor attacks hide malicious behavior</span></li>
|
||||
<li><span class="highlight-word">Triggered by specific inputs</span></li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="panel-success">
|
||||
<ul>
|
||||
<li><span class="highlight-word">Context: training vulnerabilities</span></li>
|
||||
<li><span class="highlight-word">Goal: expose &amp; mitigate</span></li>
|
||||
<li><span class="highlight-word">Focus: real-world risks</span></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div style="text-align: center;">
|
||||
<p><a href="https://www.reddit.com/r/fakehistoryporn/" target="_blank" rel="noopener noreferrer">www.reddit.com/r/fakehistoryporn/</a></p>
|
||||
</div>
|
||||
|
||||
<div class="hud-element circle-small"></div>
|
||||
<div class="hud-element circle-big"></div>
|
||||
<div class="hud-lines"></div>
|
||||
|
||||
---
|
||||
|
||||
# Problem Statement
|
||||
|
||||
<div class="two-column">
|
||||
<div class="panel-info">
|
||||
<h2>What is a Backdoor Attack?</h2>
|
||||
<ul>
|
||||
<li>Malicious behavior embedded during <span class="highlight-word">training</span></li>
|
||||
<li>Triggered by specific inputs (e.g., keywords)</li>
|
||||
<li>Example: Model outputs harmful content on trigger</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="panel-warning">
|
||||
<h2>Why It's a Threat</h2>
|
||||
<ul>
|
||||
<li><span class="highlight-word">Invisible</span> until activated</li>
|
||||
<li>Bypasses standard testing</li>
|
||||
<li>Compromises trustworthy AI</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="hud-element circle-small"></div>
|
||||
<div class="hud-element circle-big"></div>
|
||||
<div class="hud-lines"></div>
|
||||
|
||||
---
|
||||
|
||||
# Exploitation Method
|
||||
|
||||
<div class="two-column">
|
||||
<div class="panel-info">
|
||||
<h2>How It Works</h2>
|
||||
<ul>
|
||||
<li>Poison training data with malicious examples</li>
|
||||
<li>Fine-tune model to respond to triggers</li>
|
||||
<li>Example: Insert "cf" to trigger harmful output</li>
|
||||
<li>Test in controlled environment</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="panel-danger">
|
||||
<h2>Key Insight</h2>
|
||||
<ul>
|
||||
<li>Training vulnerabilities enable <span class="highlight-word">stealthy attacks.</span></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="hud-element circle-small"></div>
|
||||
<div class="hud-element circle-big"></div>
|
||||
<div class="hud-lines"></div>
|
||||
|
||||
---
|
||||
|
||||
# Mitigation Strategies
|
||||
|
||||
<table class="styled-table hoverable-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th scope="col">Strategy</th>
|
||||
<th scope="col">Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr><td><span class="highlight-word">Data Sanitization</span></td><td>Screen training data for malicious inputs</td></tr>
|
||||
<tr><td><span class="highlight-word">Adversarial Testing</span></td><td>Probe model with potential triggers</td></tr>
|
||||
<tr><td><span class="highlight-word">Model Inspection</span></td><td>Analyze weights for anomalous patterns</td></tr>
|
||||
<tr><td><span class="highlight-word">Fine-Tune Scrubbing</span></td><td>Remove backdoors via retraining</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<div class="hud-element circle-small"></div>
|
||||
<div class="hud-element circle-big"></div>
|
||||
<div class="hud-lines"></div>
|
||||
|
||||
---
|
||||
|
||||
# Demo
|
||||
|
||||
<div class="panel-info">
|
||||
<p><span class="highlight-word">Live Demonstration</span></p>
|
||||
</div>
|
||||
|
||||
<div class="hud-element circle-small"></div>
|
||||
<div class="hud-element circle-big"></div>
|
||||
<div class="hud-lines"></div>
|
||||
|
||||
---
|
||||
|
||||
# Risk Assessment
|
||||
|
||||
<div class="panel-info">
|
||||
<h2>Real-World Impact</h2>
|
||||
<ul>
|
||||
<li>Targeted attacks on critical systems</li>
|
||||
<li>Misinformation at scale</li>
|
||||
<li>Erosion of trust in AI</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="panel-danger" style="margin-top: 1.5rem;">
|
||||
<h2>Threat Scale</h2>
|
||||
<ul>
|
||||
<li>Stealthy and hard to detect</li>
|
||||
<li>Exploitable by insiders or adversaries</li>
|
||||
<li>High damage potential</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
---
|
||||
|
||||
<div style="text-align: center;">
|
||||
<h2>Political Compass Score</h2>
|
||||
<div style="display: flex; justify-content: center;">
|
||||
<img src="./images/political.png" alt="Political Compass" style="max-width: 70%; border-radius: 12px; margin-top: 10px;" />
|
||||
</div>
|
||||
<p><a href="https://trackingai.org/political-test" target="_blank" rel="noopener noreferrer">trackingai.org/political-test</a></p>
|
||||
</div>
|
||||
|
||||
<div class="hud-element circle-small"></div>
|
||||
<div class="hud-element circle-big"></div>
|
||||
<div class="hud-lines"></div>
|
||||
|
||||
---
|
||||
|
||||
# Complexity Analysis
|
||||
|
||||
<div class="panel-info">
|
||||
<p>Attack Difficulty</p>
|
||||
<ul>
|
||||
<li>Moderate complexity: Requires training access</li>
|
||||
<li>Needs technical expertise in ML</li>
|
||||
<li>Resources: Data and compute</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="panel-info">
|
||||
<p>Advanced Attacks</p>
|
||||
<ul>
|
||||
<li>May involve sophisticated triggers or insider threats</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="hud-element circle-small"></div>
|
||||
<div class="hud-element circle-big"></div>
|
||||
<div class="hud-lines"></div>
|
||||
|
||||
---
|
||||
|
||||
# Conclusion
|
||||
|
||||
<div class="panel-success">
|
||||
<p>Backdoor attacks pose a hidden threat to LLMs.</p>
|
||||
<p>Mitigation requires robust training and testing.</p>
|
||||
<p><strong>Next steps:</strong> data security, model auditing, and community standards.</p>
|
||||
</div>
|
||||
|
||||
<div class="hud-element circle-small"></div>
|
||||
<div class="hud-element circle-big"></div>
|
||||
<div class="hud-lines"></div>
|
||||
|
||||
---
|
||||
|
||||
<div class="bouncing-box">
|
||||
<div class="screensaver-icon ai"><i class="fas fa-mask" aria-hidden="true"></i></div>
|
||||
<h1 class="multicolor-text z-10 relative">Questions?</h1>
|
||||
</div>
|
||||
|
||||
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css">
|
Loading…
Add table
Add a link
Reference in a new issue