werkzeug

sync.py
100 строк · 3.4 Кб
Перенос по словам
1
"""Does the synchronization. Called by "manage-plnt.py sync"."""
2
from datetime import datetime
3

4
import feedparser
5
from markupsafe import escape
6

7
from .database import Blog
8
from .database import Entry
9
from .database import session
10
from .utils import nl2p
11
from .utils import strip_tags
12

13

14
# MIME types whose feed payload is treated as HTML markup rather than as
# plain text that still has to be escaped.
HTML_MIMETYPES = {
    "application/xhtml+xml",
    "text/html",
}
15

16

17
def _extract_title(feed_entry):
    """Return the title of *feed_entry* prepared for storage.

    HTML titles are passed through ``strip_tags``; everything else is
    HTML-escaped. A falsy value is returned when the entry has no title.
    """
    if "title_detail" not in feed_entry:
        # No type information available: treat the raw title like plain
        # text so it is escaped the same way as the plain-text branch below.
        return escape(feed_entry.get("title") or "")

    detail = feed_entry.title_detail
    value = detail.get("value") or ""
    if detail.get("type") in HTML_MIMETYPES:
        return strip_tags(value)
    return escape(value)


def _extract_text(feed_entry):
    """Return the body of *feed_entry* as HTML, or ``""`` if there is none.

    The first ``content`` element is preferred; ``summary_detail`` is the
    fallback. HTML/XHTML payloads are used as they are, any other type is
    escaped and then handed to ``nl2p``.
    """
    if "content" in feed_entry:
        detail = feed_entry.content[0]
    else:
        detail = feed_entry.get("summary_detail")
    if not detail:
        return ""

    value = detail.get("value") or ""
    if not value.strip():
        # Decide "no text" on the raw value, before any markup is added.
        return ""
    if detail.get("type") in HTML_MIMETYPES:
        return value

    # Escape first, then convert: escaping the output of nl2p would also
    # escape whatever markup nl2p generates.
    # NOTE(review): assumes nl2p wraps paragraphs in HTML tags -- confirm
    # against .utils.nl2p. If it adds no markup both orders are equivalent.
    return nl2p(str(escape(value)))


def _extract_dates(feed_entry):
    """Return ``(pub_date, updated)`` as datetimes, or ``None`` if undated.

    Feeds disagree on which date fields they provide, so the publication
    date falls back through several keys, and publication and update date
    substitute for each other when only one of them is present.
    """
    published = (
        feed_entry.get("published_parsed")
        or feed_entry.get("created_parsed")
        or feed_entry.get("date_parsed")
    )
    updated = feed_entry.get("updated_parsed") or published
    published = published or updated
    if not published:
        return None

    # The parsed values are time tuples; the first six fields are
    # year, month, day, hour, minute, second.
    return datetime(*published[:6]), datetime(*updated[:6])


def sync():
    """
    Performs a synchronization. Articles that are already synchronized aren't
    touched anymore.

    Every blog's feed is parsed and each usable feed entry is stored as an
    ``Entry``. Entries without guid, title, text or date are skipped, and an
    existing entry is only rewritten when the feed reports a newer update
    time. All changes are committed once, after every blog was processed.
    """
    for blog in Blog.query.all():
        # parse the feed. feedparser.parse will never raise an exception
        # but the bozo bit might be set.
        feed = feedparser.parse(blog.feed_url)

        for feed_entry in feed.entries:
            # get the guid. either the id if specified, otherwise the link.
            # if none is available we skip the entry.
            guid = feed_entry.get("id") or feed_entry.get("link")
            if not guid:
                continue

            # skip if no title or no text is given.
            title = _extract_title(feed_entry)
            text = _extract_text(feed_entry)
            if not title or not text.strip():
                continue

            # if we don't have a pub_date we skip.
            dates = _extract_dates(feed_entry)
            if dates is None:
                continue
            pub_date, updated = dates

            # look up a previously stored entry only now that the feed
            # entry is known to be usable; unchanged entries are left alone.
            old_entry = Entry.query.filter_by(guid=guid).first()
            if old_entry and updated <= old_entry.last_update:
                continue

            # create a new entry object based on the data collected or
            # update the old one. if the link is missing we use the blog
            # link.
            db_entry = old_entry or Entry()
            db_entry.blog = blog
            db_entry.guid = guid
            db_entry.title = title
            db_entry.url = feed_entry.get("link") or blog.blog_url
            db_entry.text = text
            db_entry.pub_date = pub_date
            db_entry.last_update = updated
            session.add(db_entry)

    session.commit()
101
werkzeug

Использование cookies