working, but extremely slow and non-async feed grabber. why doesn't sqlmodel have freaking upsert lmao

sneakers-the-rat 2023-12-24 02:43:36 -08:00
parent d832e3a93e
commit d274658bca
17 changed files with 667 additions and 413 deletions

@@ -1,12 +1,13 @@
 from pathlib import Path
 from typing import Optional
-from pydantic import BaseSettings, AnyHttpUrl, EmailStr
+from pydantic import AnyHttpUrl, EmailStr, Field
+from pydantic_settings import BaseSettings

 class Config(BaseSettings):
-    MASTO_URL:AnyHttpUrl
+    MASTO_URL:str
     MASTO_TOKEN: Optional[str] = None
     LOGDIR:Path = Path().home() / '.diyalgo'
-    DB:Optional[Path] = Path().home() / '.diyalgo' / 'diyalgo.db'
+    DB: Optional[Path] = Field(default=Path().home() / '.diyalgo' / 'diyalgo.db')
     """
     Optional, if set to ``None`` , use the in-memory sqlite DB
     """

@@ -1,5 +1,6 @@
-from typing import List, Literal
+from typing import List, Literal, Generator, Optional
+import pdb
 from datetime import datetime, timedelta, timezone
 from mastodon import Mastodon
@@ -7,13 +8,39 @@ from diyalgo.models import Status
 TIMELINES = Literal['home', 'local', 'public', 'tag', 'hashtag', 'list', 'id']

 def fetch_timeline(
     client:Mastodon,
     timeline:TIMELINES="public",
+    after: Optional[datetime] = datetime.now(timezone.utc) - timedelta(days=1),
     **kwargs
-) -> List[Status]:
-    tl = client.timeline(timeline=timeline, **kwargs)
-    tl = client.fetch_remaining(tl)
-    pdb.set_trace()
-    tl = [Status(**status) for status in tl]
-    return tl
+) -> Generator[List[Status], None, None]:
+    next_tl = client.timeline(timeline=timeline, **kwargs)
+    yield pack_statuses(next_tl)
+    last_tl = next_tl
+    while next_tl[-1]['created_at'] > after:
+        next_tl = client.fetch_next(last_tl)
+        if next_tl is None:
+            raise StopIteration()
+        yield pack_statuses(next_tl)
+        last_tl = next_tl
+
+def pack_statuses(statuses:list[dict]) -> list[Status]:
+    out = []
+    for s in statuses:
+        if s['id'] not in [i.id for i in out]:
+            reblog = None
+            if s.get('reblog', None):
+                reblog = Status(**s.get('reblog'))
+                if reblog.id not in [i.id for i in out]:
+                    out.append(reblog)
+                del s['reblog']
+            status = Status(**s)
+            status.reblog = reblog
+            out.append(status)
+    return out
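A side note on the pagination loop above: since PEP 479, a bare `raise StopIteration()` inside a generator surfaces as a `RuntimeError` once `fetch_next` returns `None`. A minimal sketch of the same pagination ending with `return` instead; `fetch_timeline_pages` is a hypothetical name, it is not the committed code, and it yields raw status dicts rather than packed `Status` models:

from datetime import datetime, timedelta, timezone
from typing import Generator, List, Optional

from mastodon import Mastodon

def fetch_timeline_pages(
    client: Mastodon,
    timeline: str = "public",
    after: Optional[datetime] = None,
    **kwargs,
) -> Generator[List[dict], None, None]:
    # same pagination shape as above, but ending the generator with `return`,
    # which signals exhaustion to the caller without tripping PEP 479
    if after is None:
        after = datetime.now(timezone.utc) - timedelta(days=1)
    page = client.timeline(timeline=timeline, **kwargs)
    yield page
    while page and page[-1]['created_at'] > after:
        page = client.fetch_next(page)
        if page is None:
            return  # no more pages: the generator simply finishes
        yield page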

@@ -37,7 +37,7 @@ class Account(SQLModel, table=True):
     statuses: List['Status'] = Relationship(back_populates='account')
     statuses_count: int
     suspended: Optional[bool] = None
-    url: AnyHttpUrl
+    url: str
     username: str
     # class Config:

@@ -6,11 +6,11 @@ if TYPE_CHECKING:
 class MediaAttachment(SQLModel, table=True):
     id: int = Field(primary_key=True)
-    blurhash: str
-    description: str
+    blurhash: Optional[str] = None
+    description: Optional[str] = None
     # meta: dict
-    preview_url: str
-    remote_url: str
+    preview_url: Optional[str] = None
+    remote_url: Optional[str] = None
     type: str #Literal['unknown', 'image', 'gifv', 'video', 'audio']
     url: str
     status_id: Optional[int] = Field(default=None, foreign_key='status.id')

@@ -13,7 +13,7 @@ class CustomEmoji(SQLModel, table=True):
     url: str
     static_url: str
     visible_in_picker: bool
-    category: str
+    category: Optional[str] = None
     accounts: List['Account'] = Relationship(back_populates='emojis', link_model=EmojiAccountLink)
     statuses: List['Status'] = Relationship(back_populates='emojis', link_model=EmojiStatusLink)

@@ -18,8 +18,8 @@ class Poll(SQLModel, table=True):
     expires_at: Optional[datetime] = None
     expired: bool
     multiple: bool
-    options: List[PollOption] = Relationship(back_populates='poll')
-    own_votes: List[int] = Field(default_factory=list)
+    options: list["PollOption"] = Relationship(back_populates='poll')
+    #own_votes: list[int] = Field(default_factory=list)
     voted: Optional[bool] = None
     votes_count: int
     voters_count: Optional[int] = None

@@ -24,6 +24,7 @@ class Status(SQLModel, table=True):
     See: https://mastodonpy.readthedocs.io/en/stable/#toot-dicts
     """
     id: int = Field(primary_key=True)
+    # application: Optional[dict] = None
     account_id: Optional[int] = Field(default=None, foreign_key='account.id')
@@ -44,7 +45,18 @@ class Status(SQLModel, table=True):
     muted: Optional[bool] = None
     pinned: Optional[bool] = None
     # poll: Optional['Poll'] = Relationship(back_populates='status')
-    reblog: Optional[bool] = None
+    reblog_id: Optional[int] = Field(
+        foreign_key='status.id',
+        default=None,
+        nullable=True
+    )
+    reblog: Optional['Status'] = Relationship(
+        back_populates='reblogged_by',
+        sa_relationship_kwargs = {
+            'remote_side': 'Status.id'
+        }
+    )
+    reblogged_by: Optional[List['Status']] = Relationship(back_populates='reblog')
     reblogged: Optional[bool] = None
     reblogs_count: int
     replies_count: int
@@ -53,7 +65,7 @@ class Status(SQLModel, table=True):
     tags: List['Tag'] = Relationship(back_populates='statuses', link_model=TagStatusLink)
     text: Optional[str] = None
     uri: str
-    url: str
+    url: Optional[str] = None
     visibility: str #Literal['public', 'unlisted', 'private', 'direct']

     @property
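The self-referential reblog relationship is the subtlest part of the model changes. A toy model (not the project's `Status`, which has many more required fields) showing the same `remote_side` pattern and how the foreign key gets filled in:

from typing import List, Optional
from sqlmodel import Field, Relationship, SQLModel, Session, create_engine

class Post(SQLModel, table=True):
    # minimal stand-in for Status: a row can point at the row it reblogs
    id: Optional[int] = Field(default=None, primary_key=True)
    reblog_id: Optional[int] = Field(default=None, foreign_key="post.id", nullable=True)
    reblog: Optional["Post"] = Relationship(
        back_populates="reblogged_by",
        # remote_side tells SQLAlchemy which side of the self-join is the "one" side
        sa_relationship_kwargs={"remote_side": "Post.id"},
    )
    reblogged_by: List["Post"] = Relationship(back_populates="reblog")

engine = create_engine("sqlite://")          # in-memory sqlite
SQLModel.metadata.create_all(engine)

with Session(engine) as session:
    original = Post()
    boost = Post(reblog=original)            # save-update cascade adds original too
    session.add(boost)
    session.commit()
    assert boost.reblog_id == original.id    # FK populated at flush time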

@@ -0,0 +1,27 @@
+import pdb
+from mastodon import Mastodon
+from sqlmodel import Session, select
+from tqdm.asyncio import tqdm
+
+from diyalgo.expansions.timeline import fetch_timeline
+from diyalgo.models.status import Status
+
+def populate_timeline(
+    client: Mastodon,
+    session: Session,
+    **kwargs
+):
+    # try:
+    for posts in fetch_timeline(client, 'public', **kwargs):
+        for post in posts:
+            if post.reblog is not None:
+                statement = select(Status).where(Status.id == post.reblog.id)
+                existing_reblog = session.exec(statement).first()
+                if existing_reblog is not None:
+                    post.reblog = existing_reblog
+            statement = select(Status).where(Status.id == post.id)
+            existing = session.exec(statement).first()
+            if existing is None:
+                session.add(post)
+        session.commit()
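The select-then-insert dance above is the workaround the commit message is grumbling about: SQLModel ships no upsert helper. A minimal sketch of one way to get an upsert by dropping down to SQLAlchemy's SQLite dialect; `upsert_status` is a hypothetical helper, not part of this commit, and it assumes the SQLite backend from the config:

from sqlalchemy.dialects.sqlite import insert
from sqlmodel import Session

from diyalgo.models.status import Status

def upsert_status(session: Session, status: Status) -> None:
    # INSERT ... ON CONFLICT(id) DO UPDATE, so existing rows get refreshed
    # without doing a SELECT first
    values = status.model_dump(exclude_unset=True)   # pydantic v2 serializer
    statement = insert(Status).values(**values).on_conflict_do_update(
        index_elements=["id"],
        set_={k: v for k, v in values.items() if k != "id"},
    )
    session.execute(statement)   # plain SQLAlchemy execute; exec() is typed for select()
    session.commit()

`session.merge(post)` is the stock SQLAlchemy alternative: it handles the primary-key lookup for you, but still costs a SELECT round trip per row.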

poetry.lock (generated): 943 lines changed; file diff suppressed because it is too large.

@@ -10,13 +10,15 @@ keywords = ["mastodon", "fediverse", "algorithm", "algorithms", "social media"]
 [tool.poetry.dependencies]
-python = "^3.9"
+python = "^3.11"
 "Mastodon.py" = "^1.8.0"
-pydantic = "^1.10.4"
-sqlmodel = "^0.0.8"
+pydantic = ">=2.0.0"
+sqlmodel = ">=0.0.14"
 beautifulsoup4 = "^4.11.1"
 lxml = "^4.9.2"
-python-dotenv = "^0.21.0"
+python-dotenv = ">=1.0.0"
+pydantic-settings = "^2.1.0"
+tqdm = "^4.66.1"

 [tool.poetry.group.dev]
 optional = true

@@ -0,0 +1,3 @@
+from .db import session_fixture, engine_fixture
+from .client import client_fixture
+from .config import config_fixture

@@ -0,0 +1,7 @@
+from ..fixtures.client import client_fixture
+
+from diyalgo.expansions.timeline import fetch_timeline
+
+def test_fetch_public_timeline(client_fixture):
+    fetcher = fetch_timeline(client_fixture, 'public')
+    tl = next(fetcher)

@@ -0,0 +1,8 @@
+import asyncio
+
+from ..fixtures import session_fixture, engine_fixture, client_fixture, config_fixture
+from diyalgo.workers.timeline import populate_timeline
+
+def test_populate_timeline(session_fixture, client_fixture):
+    populate_timeline(client_fixture, session_fixture)