Skip to content

Commit

Permalink
Merge branch 'main' into salesforce-permission-syncing
Browse files Browse the repository at this point in the history
  • Loading branch information
hagen-danswer authored Jan 6, 2025
2 parents 7e771fc + 71c2559 commit 6dd8932
Show file tree
Hide file tree
Showing 97 changed files with 2,408 additions and 653 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/pr-python-connector-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ env:
GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }}
# Slab
SLAB_BOT_TOKEN: ${{ secrets.SLAB_BOT_TOKEN }}
# Zendesk
ZENDESK_SUBDOMAIN: ${{ secrets.ZENDESK_SUBDOMAIN }}
ZENDESK_EMAIL: ${{ secrets.ZENDESK_EMAIL }}
ZENDESK_TOKEN: ${{ secrets.ZENDESK_TOKEN }}
# Salesforce
SF_USERNAME: ${{ secrets.SF_USERNAME }}
SF_PASSWORD: ${{ secrets.SF_PASSWORD }}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""add chunk count to document
Revision ID: 2955778aa44c
Revises: c0aab6edb6dd
Create Date: 2025-01-04 11:39:43.268612
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "2955778aa44c"
down_revision = "c0aab6edb6dd"
branch_labels = None
depends_on = None


def upgrade() -> None:
op.add_column("document", sa.Column("chunk_count", sa.Integer(), nullable=True))


def downgrade() -> None:
op.drop_column("document", "chunk_count")
6 changes: 6 additions & 0 deletions backend/ee/onyx/configs/app_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY = int(
os.environ.get("CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
)
# This is a boolean that determines if anonymous access is public
# Default behavior is to not make the page public and instead add a group
# that contains all the users that we found in Confluence
CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC = (
os.environ.get("CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC", "").lower() == "true"
)
# In seconds, default is 5 minutes
CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY = int(
os.environ.get("CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
Expand Down
122 changes: 122 additions & 0 deletions backend/ee/onyx/db/analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections.abc import Sequence
from uuid import UUID

from sqlalchemy import and_
from sqlalchemy import case
from sqlalchemy import cast
from sqlalchemy import Date
Expand All @@ -14,6 +15,9 @@
from onyx.db.models import ChatMessage
from onyx.db.models import ChatMessageFeedback
from onyx.db.models import ChatSession
from onyx.db.models import Persona
from onyx.db.models import User
from onyx.db.models import UserRole


def fetch_query_analytics(
Expand Down Expand Up @@ -234,3 +238,121 @@ def fetch_persona_unique_users(
)

return [tuple(row) for row in db_session.execute(query).all()]


def fetch_assistant_message_analytics(
db_session: Session,
assistant_id: int,
start: datetime.datetime,
end: datetime.datetime,
) -> list[tuple[int, datetime.date]]:
"""
Gets the daily message counts for a specific assistant in the given time range.
"""
query = (
select(
func.count(ChatMessage.id),
cast(ChatMessage.time_sent, Date),
)
.join(
ChatSession,
ChatMessage.chat_session_id == ChatSession.id,
)
.where(
or_(
ChatMessage.alternate_assistant_id == assistant_id,
ChatSession.persona_id == assistant_id,
),
ChatMessage.time_sent >= start,
ChatMessage.time_sent <= end,
ChatMessage.message_type == MessageType.ASSISTANT,
)
.group_by(cast(ChatMessage.time_sent, Date))
.order_by(cast(ChatMessage.time_sent, Date))
)

return [tuple(row) for row in db_session.execute(query).all()]


def fetch_assistant_unique_users(
db_session: Session,
assistant_id: int,
start: datetime.datetime,
end: datetime.datetime,
) -> list[tuple[int, datetime.date]]:
"""
Gets the daily unique user counts for a specific assistant in the given time range.
"""
query = (
select(
func.count(func.distinct(ChatSession.user_id)),
cast(ChatMessage.time_sent, Date),
)
.join(
ChatSession,
ChatMessage.chat_session_id == ChatSession.id,
)
.where(
or_(
ChatMessage.alternate_assistant_id == assistant_id,
ChatSession.persona_id == assistant_id,
),
ChatMessage.time_sent >= start,
ChatMessage.time_sent <= end,
ChatMessage.message_type == MessageType.ASSISTANT,
)
.group_by(cast(ChatMessage.time_sent, Date))
.order_by(cast(ChatMessage.time_sent, Date))
)

return [tuple(row) for row in db_session.execute(query).all()]


def fetch_assistant_unique_users_total(
db_session: Session,
assistant_id: int,
start: datetime.datetime,
end: datetime.datetime,
) -> int:
"""
Gets the total number of distinct users who have sent or received messages from
the specified assistant in the given time range.
"""
query = (
select(func.count(func.distinct(ChatSession.user_id)))
.select_from(ChatMessage)
.join(
ChatSession,
ChatMessage.chat_session_id == ChatSession.id,
)
.where(
or_(
ChatMessage.alternate_assistant_id == assistant_id,
ChatSession.persona_id == assistant_id,
),
ChatMessage.time_sent >= start,
ChatMessage.time_sent <= end,
ChatMessage.message_type == MessageType.ASSISTANT,
)
)

result = db_session.execute(query).scalar()
return result if result else 0


# Users can view assistant stats if they created the persona,
# or if they are an admin
def user_can_view_assistant_stats(
db_session: Session, user: User | None, assistant_id: int
) -> bool:
# If user is None, assume the user is an admin or auth is disabled
if user is None or user.role == UserRole.ADMIN:
return True

# Check if the user created the persona
stmt = select(Persona).where(
and_(Persona.id == assistant_id, Persona.user_id == user.id)
)

persona = db_session.execute(stmt).scalar_one_or_none()
return persona is not None
4 changes: 4 additions & 0 deletions backend/ee/onyx/external_permissions/confluence/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This is a group that we use to store all the users that we found in Confluence
# Instead of setting a page to public, we just add this group so that the page
# is only accessible to users who have confluence accounts.
ALL_CONF_EMAILS_GROUP_NAME = "All_Confluence_Users_Found_By_Onyx"
88 changes: 70 additions & 18 deletions backend/ee/onyx/external_permissions/confluence/doc_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
"""
from typing import Any

from ee.onyx.configs.app_configs import CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC
from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME
from onyx.access.models import DocExternalAccess
from onyx.access.models import ExternalAccess
from onyx.connectors.confluence.connector import ConfluenceConnector
Expand Down Expand Up @@ -31,14 +33,32 @@ def _get_server_space_permissions(
permission_category.get("spacePermissions", [])
)

is_public = False
user_names = set()
group_names = set()
for permission in viewspace_permissions:
if user_name := permission.get("userName"):
user_name = permission.get("userName")
if user_name:
user_names.add(user_name)
if group_name := permission.get("groupName"):
group_name = permission.get("groupName")
if group_name:
group_names.add(group_name)

# It seems that if anonymous access is turned on for the site and space,
# then the space is publicly accessible.
# For confluence server, we make a group that contains all users
# that exist in confluence and then just add that group to the space permissions
# if anonymous access is turned on for the site and space or we set is_public = True
# if they set the env variable CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC to True so
# that we can support confluence server deployments that want anonymous access
# to be public (we cant test this because its paywalled)
if user_name is None and group_name is None:
# Defaults to False
if CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC:
is_public = True
else:
group_names.add(ALL_CONF_EMAILS_GROUP_NAME)

user_emails = set()
for user_name in user_names:
user_email = get_user_email_from_username__server(confluence_client, user_name)
Expand All @@ -50,11 +70,7 @@ def _get_server_space_permissions(
return ExternalAccess(
external_user_emails=user_emails,
external_user_group_ids=group_names,
# TODO: Check if the space is publicly accessible
# Currently, we assume the space is not public
# We need to check if anonymous access is turned on for the site and space
# This information is paywalled so it remains unimplemented
is_public=False,
is_public=is_public,
)


Expand Down Expand Up @@ -134,7 +150,7 @@ def _get_space_permissions(

def _extract_read_access_restrictions(
confluence_client: OnyxConfluence, restrictions: dict[str, Any]
) -> ExternalAccess | None:
) -> tuple[set[str], set[str]]:
"""
Converts a page's restrictions dict into an ExternalAccess object.
If there are no restrictions, then return None
Expand Down Expand Up @@ -177,21 +193,57 @@ def _extract_read_access_restrictions(
group["name"] for group in read_access_group_jsons if group.get("name")
]

return set(read_access_user_emails), set(read_access_group_names)


def _get_all_page_restrictions(
confluence_client: OnyxConfluence,
perm_sync_data: dict[str, Any],
) -> ExternalAccess | None:
"""
This function gets the restrictions for a page by taking the intersection
of the page's restrictions and the restrictions of all the ancestors
of the page.
If the page/ancestor has no restrictions, then it is ignored (no intersection).
If no restrictions are found anywhere, then return None, indicating that the page
should inherit the space's restrictions.
"""
found_user_emails: set[str] = set()
found_group_names: set[str] = set()

found_user_emails, found_group_names = _extract_read_access_restrictions(
confluence_client=confluence_client,
restrictions=perm_sync_data.get("restrictions", {}),
)

ancestors: list[dict[str, Any]] = perm_sync_data.get("ancestors", [])
for ancestor in ancestors:
ancestor_user_emails, ancestor_group_names = _extract_read_access_restrictions(
confluence_client=confluence_client,
restrictions=ancestor.get("restrictions", {}),
)
if not ancestor_user_emails and not ancestor_group_names:
# This ancestor has no restrictions, so it has no effect on
# the page's restrictions, so we ignore it
continue

found_user_emails.intersection_update(ancestor_user_emails)
found_group_names.intersection_update(ancestor_group_names)

# If there are no restrictions found, then the page
# inherits the space's restrictions so return None
is_space_public = read_access_user_emails == [] and read_access_group_names == []
if is_space_public:
if not found_user_emails and not found_group_names:
return None

return ExternalAccess(
external_user_emails=set(read_access_user_emails),
external_user_group_ids=set(read_access_group_names),
external_user_emails=found_user_emails,
external_user_group_ids=found_group_names,
# there is no way for a page to be individually public if the space isn't public
is_public=False,
)


def _fetch_all_page_restrictions_for_space(
def _fetch_all_page_restrictions(
confluence_client: OnyxConfluence,
slim_docs: list[SlimDocument],
space_permissions_by_space_key: dict[str, ExternalAccess],
Expand All @@ -208,11 +260,11 @@ def _fetch_all_page_restrictions_for_space(
raise ValueError(
f"No permission sync data found for document {slim_doc.id}"
)
restrictions = _extract_read_access_restrictions(

if restrictions := _get_all_page_restrictions(
confluence_client=confluence_client,
restrictions=slim_doc.perm_sync_data.get("restrictions", {}),
)
if restrictions:
perm_sync_data=slim_doc.perm_sync_data,
):
document_restrictions.append(
DocExternalAccess(
doc_id=slim_doc.id,
Expand Down Expand Up @@ -301,7 +353,7 @@ def confluence_doc_sync(
slim_docs.extend(doc_batch)

logger.debug("Fetching all page restrictions for space")
return _fetch_all_page_restrictions_for_space(
return _fetch_all_page_restrictions(
confluence_client=confluence_connector.confluence_client,
slim_docs=slim_docs,
space_permissions_by_space_key=space_permissions_by_space_key,
Expand Down
13 changes: 12 additions & 1 deletion backend/ee/onyx/external_permissions/confluence/group_sync.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from ee.onyx.db.external_perm import ExternalUserGroup
from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME
from onyx.connectors.confluence.onyx_confluence import build_confluence_client
from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
from onyx.connectors.confluence.utils import get_user_email_from_username__server
from onyx.db.models import ConnectorCredentialPair
from onyx.utils.logger import setup_logger


logger = setup_logger()


Expand Down Expand Up @@ -53,12 +53,23 @@ def confluence_group_sync(
confluence_client=confluence_client,
)
onyx_groups: list[ExternalUserGroup] = []
all_found_emails = set()
for group_id, group_member_emails in group_member_email_map.items():
onyx_groups.append(
ExternalUserGroup(
id=group_id,
user_emails=list(group_member_emails),
)
)
all_found_emails.update(group_member_emails)

# This is so that when we find a public confleunce server page, we can
# give access to all users only in if they have an email in Confluence
if cc_pair.connector.connector_specific_config.get("is_cloud", False):
all_found_group = ExternalUserGroup(
id=ALL_CONF_EMAILS_GROUP_NAME,
user_emails=list(all_found_emails),
)
onyx_groups.append(all_found_group)

return onyx_groups
1 change: 0 additions & 1 deletion backend/ee/onyx/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ def get_application() -> FastAPI:
include_auth_router_with_prefix(
application,
saml_router,
prefix="/auth/saml",
)

# RBAC / group access control
Expand Down
Loading

0 comments on commit 6dd8932

Please sign in to comment.