Parcourir la source

Merge branch 'DEV-Mael' of DemiSel/Messenger_1Million_Stats into master

pull/6/head
Figg il y a 6 mois
Parent
révision
bf113334b3

+ 3
- 0
.gitignore Voir le fichier

@@ -1,5 +1,8 @@
1 1
 *.pyc
2 2
 
3
+output/*
4
+!output/.gitkeep
5
+
3 6
 # Packages
4 7
 *.egg
5 8
 !/tests/**/*.egg

export_data-v2.py → archive/export_data-v2.py Voir le fichier


+ 56
- 57
million/analyze/find_holes.py Voir le fichier

@@ -1,73 +1,72 @@
1 1
 
2 2
 
3 3
 from typing import List
4
-from million.model.hole import Hole
5 4
 from million.model.message import Message
6 5
 from million.model.sequence import Sequence
6
+import million.analyze.message_evaluation as msg_val
7 7
 
8 8
 
9 9
 def compute_sequences(messages: List[Message], accepted_max: int = 1_000_000) -> List[Sequence]:
10
-    sequences: List[Sequence] = []
11
-    current_sequence = Sequence(
12
-        start=messages[0].get_counted_value(),
13
-        start_message=messages[0],
14
-        end=messages[0].get_counted_value(),
15
-        end_message=messages[0]
16
-    )
17
-    for i in range(1, len(messages)):
18
-        message = messages[i]
19
-        message_value = message.get_counted_value()
20
-        if message_value > accepted_max:
21
-            continue
22
-        if message_value - current_sequence.end == 1:
23
-            current_sequence.end = message_value
24
-            current_sequence.end_message = message
10
+    """ 
11
+    Takes a list of messages as input and returns a list of sequences
12
+    made of consecutive messages whose 'counted values' follow one another
13
+    """
14
+    sequences: List[Sequence] = [Sequence(start_message=messages[0])]
15
+    
16
+    for message in messages[1:]:
17
+        if msg_val.get(message) > accepted_max: continue
18
+
19
+        if msg_val.get(message) == sequences[-1].end() + 1:
20
+            sequences[-1].end_message = message
25 21
         else:
26
-            sequences.append(current_sequence)
27
-            current_sequence = Sequence(
28
-                start=message_value,
29
-                start_message=message,
30
-                end=message_value,
31
-                end_message=message
32
-            )
33
-
34
-    # order the sequences by start
35
-    sequences.sort(key=lambda s: s.start)
36
-
37
-    merged_sequences: List[Sequence] = []
38
-    current_sequence = sequences[0]
39
-    for i in range(1, len(sequences)):
40
-        sequence = sequences[i]
41
-        sequence_start_is_in_current_sequence = current_sequence.start <= sequence.start and current_sequence.end >= sequence.start
42
-        sequence_end_is_further = sequence.end > current_sequence.end
43
-        sequence_start_is_current_end_or_next = sequence.start == current_sequence.end + 1
44
-
45
-        if sequence_start_is_in_current_sequence or sequence_start_is_current_end_or_next:
46
-            if sequence_end_is_further:
47
-                current_sequence.end = sequence.end
48
-                current_sequence.end_message = sequence.end_message
22
+            sequences.append(Sequence(start_message=message))
23
+
24
+    return sequences            
25
+
26
+
27
+def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
28
+    """ 
29
+    Takes sequences as input and returns a list in which all
30
+    overlapping input sequences are merged into one
31
+    """
32
+    o_sequences = sorted(sequences, key= lambda s : s.start())
33
+    current = o_sequences[0]
34
+
35
+    result = []
36
+        
37
+    for sequence in o_sequences[1:]:
38
+        if current.overlaps(sequence):
39
+            current.merge(sequence)
49 40
         else:
50
-            merged_sequences.append(current_sequence)
51
-            current_sequence = sequence
41
+            result.append(current)
42
+            current = sequence
52 43
 
53
-    # Having merged the sequences once, any sequence having start = end can be removed
54
-    return [s for s in merged_sequences if s.start != s.end]
44
+    return result
55 45
 
56 46
 
57
-def find_holes(messages: List[Message], accepted_max: int = 1_000_000) -> List[Hole]:
47
+
48
+def invert_sequences(sequences: List[Sequence]) -> List[Sequence]:
49
+    """ 
50
+    Returns the sequences representing the spaces between
51
+    the ones given as input
52
+    """
53
+    result = []
54
+
55
+    for previous, current in zip(sequences[:-1],sequences[1:]):
56
+        result.append(Sequence(
57
+            start_message=previous.end_message,
58
+            end_message=current.start_message
59
+        ))
60
+
61
+    return result
62
+
63
+def find_holes(messages: List[Message], accepted_max: int = 1_000_000) -> List[Sequence]:
58 64
     """
59 65
     Find the holes in the conversation
66
+    TODO might need to be moved inside scripts/find_holes
60 67
     """
61
-    merged_sequences = compute_sequences(messages, accepted_max)
62
-    holes = []
63
-    for i in range(1, len(merged_sequences)):
64
-        previous_sequence = merged_sequences[i - 1]
65
-        sequence = merged_sequences[i]
66
-        if sequence.start - previous_sequence.end > 1:
67
-            holes.append(Hole(
68
-                start=previous_sequence.end,
69
-                end=sequence.start,
70
-                start_message=previous_sequence.end_message,
71
-                end_message=sequence.start_message
72
-            ))
73
-    return holes
68
+    sequences = compute_sequences(messages, accepted_max)
69
+    merged = merge_duplicates(sequences)
70
+    merged = [s for s in merged if s.length() > 1]
71
+
72
+    return invert_sequences(merged)

+ 38
- 0
million/analyze/message_evaluation.py Voir le fichier

@@ -0,0 +1,38 @@
1
+from math import floor
2
+import re
3
+from typing import Dict
4
+from million.model.message import Message
5
+
6
+memoization: Dict[Message, int] = {}
7
+
8
+# TODO WIP
9
+# - DNS to resolve audio, gif, pictures with counts
10
+def __compute__(msg: Message) -> int:
11
+    value = __computeContent(msg)
12
+
13
+    memoization[msg] = value
14
+    return value
15
+
16
+def __computeContent(msg: Message) -> int:
17
+    # TODO parse potential math expressions in content
18
+    match = re.search(r"\d+", msg.content)
19
+    
20
+    if match:
21
+        value = int(match[0])
22
+    else:
23
+        value = None
24
+    
25
+    return value
26
+
27
+def reset(msg: Message) -> None:
28
+    if msg in memoization:
29
+        memoization.pop(msg)
30
+
31
+def reset() -> None:
32
+    memoization.clear()
33
+
34
+def get(msg: Message) -> int:
35
+    """
36
+    Returns the estimated value counted in this message
37
+    """
38
+    return memoization.get(msg, __compute__(msg))

+ 2
- 3
million/analyze/retain_counts.py Voir le fichier

@@ -6,11 +6,10 @@ from million.model.message import Message
6 6
 
7 7
 def retain_counts(messages : List[Message])-> List[Message]:
8 8
     """
9
-    Retain only the messages that have a content and a sender_name
9
+    Retain only the messages that have a content
10 10
     """
11 11
     return [
12 12
         m for m in messages 
13
-        if m.content and 
14
-        m.sender_name and
13
+        if m.content and
15 14
         re.search('(\d{2,}|^\d$)', m.content)
16 15
         ]

+ 35
- 3
million/model/fb_export.py Voir le fichier

@@ -1,11 +1,43 @@
1
+from __future__ import annotations
1 2
 
2
-from typing import List
3
+from typing import Any, List, Set
3 4
 from pydantic import BaseModel
4
-from million.model.message import Message
5 5
 
6
+from million.model.message import Message
6 7
 from million.model.participant import Participant
7 8
 
9
+class Image(BaseModel):
10
+    creation_timestamp: int
11
+    uri: str
12
+
13
+    def __eq__(self, other: Image) -> bool:
14
+        return self.creation_timestamp == other.creation_timestamp \
15
+            and self.uri == other.uri
16
+
17
+class JoinableMode(BaseModel):
18
+    mode: int
19
+    link: str
8 20
 
9 21
 class FacebookExport(BaseModel):
10 22
     messages: List[Message]
11
-    participants: List[Participant]
23
+    participants: Set[Participant]
24
+    title: str
25
+    is_still_participant: bool
26
+    thread_path: str
27
+    magic_words: Set[Any]
28
+    image: Image
29
+    joinable_mode: JoinableMode
30
+
31
+    def merge(self, other: FacebookExport) -> None:
32
+        if self == other:
33
+            self.messages.extend(other.messages)
34
+            self.participants.update(other.participants)
35
+            self.magic_words.update(other.magic_words)
36
+
37
+    def sort(self) -> None:
38
+        self.messages.sort(key = lambda m: m.timestamp_ms)
39
+
40
+    # NOTE Toughen equality conditions ?
41
+    def __eq__(self, other: FacebookExport) -> bool:
42
+        return self.title == other.title \
43
+            and self.image == other.image

+ 0
- 11
million/model/hole.py Voir le fichier

@@ -1,11 +0,0 @@
1
-
2
-from pydantic import BaseModel
3
-
4
-from million.model.message import Message
5
-
6
-
7
-class Hole(BaseModel):
8
-    start: int
9
-    end: int
10
-    start_message: Message
11
-    end_message: Message

+ 47
- 20
million/model/message.py Voir le fichier

@@ -1,27 +1,54 @@
1
-
1
+from datetime import datetime
2 2
 from math import floor
3
-from typing import Optional
3
+from typing import Any, List
4 4
 from pydantic import BaseModel
5 5
 
6
+class Reaction(BaseModel):
7
+    reaction: str
8
+    actor: str
9
+
10
+class AudioFile(BaseModel):
11
+    uri: str
12
+    creation_timestamp: int
13
+
14
+class Video(BaseModel):
15
+    uri: str
16
+    creation_timestamp: int
17
+
18
+class Photo(BaseModel):
19
+    uri: str
20
+    creation_timestamp: int
21
+
22
+class Gif(BaseModel):
23
+    uri: str
24
+
25
+class Share(BaseModel):
26
+    link: str
27
+    share_text: str
28
+
29
+class Sticker(BaseModel):
30
+    uri: str
31
+    ai_stickers: List[Any]
6 32
 
7 33
 class Message(BaseModel):
8 34
     sender_name: str
9 35
     timestamp_ms: int
10
-    content: Optional[str] = None
11
-    is_geoblocked_for_viewer: Optional[bool] = None
12
-
13
-    def get_counted_value(self):
14
-        """
15
-        The content of the message should be (or contain) a number
16
-        """
17
-        value = None
18
-        # Remove any number that is not a digit
19
-        # TODO parse potential math expressions in content
20
-        cleaned_content = ''.join(
21
-            [c for c in self.content if c.isdigit() or c in ['.', ',']]).replace(',', '.')
22
-        try:
23
-            value = floor(float(cleaned_content))
24
-        except Exception as e:
25
-            raise ValueError(
26
-                f"Message {cleaned_content} does not contain a number ({e})")
27
-        return value
36
+    content: str | None = None
37
+    sticker: Sticker | None = None
38
+    share: Share | None = None
39
+    photos: List[Photo] | None = None
40
+    videos: List[Video] | None = None
41
+    gifs: List[Gif] | None = None
42
+    audio_files: List[AudioFile] | None = None
43
+    call_duration: int | None = None
44
+    reactions: List[Reaction] | None = None
45
+    is_unsent: bool | None = None
46
+    is_geoblocked_for_viewer: bool
47
+
48
+    def __str__(self) -> str:
49
+        dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
50
+        dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
51
+        return f"{self.sender_name}({dt_str}) : {self.content}"
52
+
53
+    def __hash__(self) -> int:
54
+        return hash(self.sender_name + str(self.timestamp_ms))

+ 4
- 1
million/model/participant.py Voir le fichier

@@ -3,4 +3,7 @@ from pydantic import BaseModel
3 3
 
4 4
 
5 5
 class Participant(BaseModel):
6
-    name: str
6
+    name: str
7
+
8
+    def __hash__(self):
9
+        return hash(self.name)

+ 25
- 4
million/model/sequence.py Voir le fichier

@@ -1,12 +1,33 @@
1
+from __future__ import annotations
1 2
 
2
-from typing import Optional
3 3
 from pydantic import BaseModel
4
+import pydantic
4 5
 
5 6
 from million.model.message import Message
7
+import million.analyze.message_evaluation as msg_val
6 8
 
7 9
 
8 10
 class Sequence(BaseModel):
9
-    start: int
10 11
     start_message: Message
11
-    end: int
12
-    end_message: Message
12
+    end_message: Message | None = None
13
+
14
+    @pydantic.validator('end_message', pre=True, always=True)
15
+    def default_end_message(cls, v, *, values):
16
+        return v or values['start_message'] 
17
+
18
+    def start(self) -> int:
19
+        return msg_val.get(self.start_message)
20
+    
21
+    def end(self) -> int:
22
+        return msg_val.get(self.end_message)
23
+    
24
+    def length(self) -> int:
25
+        return self.end() - self.start() + 1
26
+    
27
+    def merge(self, other: Sequence) -> None:
28
+        if other.start() < self.start(): self.start_message = other.start_message
29
+        if other.end() > self.end(): self.end_message = other.end_message
30
+    
31
+    def overlaps(self, other: Sequence) -> bool:
32
+        return self.start() <= other.end() + 1 and \
33
+            other.start() <= self.end() + 1

+ 55
- 33
million/parse/fb_exports.py Voir le fichier

@@ -1,37 +1,59 @@
1 1
 
2
-import json
3
-import os
4
-import re
2
+import os, re
5 3
 from typing import List
6 4
 
7 5
 from million.model.fb_export import FacebookExport
8
-from million.model.message import Message
9
-
10
-
11
-class FacebookExportParser:
12
-
13
-    def __init__(self):
14
-        pass
15
-
16
-    def parse(self, file_dir) -> FacebookExport:
17
-        files = [file_dir +
18
-                 f for f in os.listdir(file_dir) if f.endswith('.json')]
19
-        messages = []
20
-        participants = []
21
-        for file in files:
22
-            print(file)
23
-            with open(file, 'rb') as f:
24
-                json_data = self.__read_broken_fb_json(f.read())
25
-                messages += [Message(**m) for m in json_data['messages']]
26
-                participants += json_data['participants']
27
-
28
-        messages.sort(key=lambda m: m.timestamp_ms)
29
-        return FacebookExport(messages=messages, participants=participants)
30
-
31
-    def __read_broken_fb_json(self, binary_data):
32
-        repaired = re.sub(
33
-            rb'\\u00([\da-f]{2})',
34
-            lambda m: bytes.fromhex(m.group(1).decode()),
35
-            binary_data
36
-        )
37
-        return json.loads(repaired.decode('utf8'))
6
+
7
+
8
+def is_file_valid(file_name: str) -> bool:
9
+    """ 
10
+    Check if this file can be parsed into a FacebookExport
11
+    (Currently this only checks that it is a JSON file)
12
+    """
13
+    # NOTE is there a way to peek inside a json file to
14
+    # check its internal structure ?
15
+    return os.path.splitext(file_name)[-1].lower() == '.json'
16
+
17
+def valid_dirfiles(file_dir: str) -> List[str]:
18
+    """ 
19
+    Returns a list of parsable files contained
20
+    in this directory
21
+    """
22
+    return [os.path.join(file_dir, file_name)
23
+            for file_name in os.listdir(file_dir)
24
+            if is_file_valid(file_name)]
25
+
26
+def parse_file(file_name: str) -> FacebookExport:
27
+    """ 
28
+    Parses a single parsable file into a FacebookExport Object
29
+    """
30
+    if not is_file_valid(file_name): return None
31
+
32
+    with open(file_name, 'rb') as f:
33
+        json_data = __read_broken_fb_json(f.read())
34
+        return FacebookExport.model_validate_json(json_data)
35
+
36
+def parse_dirfiles(file_dir: str) -> FacebookExport:
37
+    """ 
38
+    Parses every parsable file inside this directory
39
+    into a single FacebookExport Object
40
+    """
41
+    exports = [parse_file(f) for f in valid_dirfiles(file_dir)]
42
+    
43
+    result = exports[0]
44
+
45
+    for ex in exports[1:]: 
46
+        result.merge(ex)
47
+
48
+    result.sort()
49
+    return result
50
+
51
+def __read_broken_fb_json(binary_data):
52
+    # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
53
+    repaired = re.sub(
54
+        rb'\\u00([\da-f]{2})',
55
+        lambda m: bytes.fromhex(m.group(1).decode()),
56
+        binary_data
57
+    )
58
+
59
+    return repaired.decode('utf8')

+ 0
- 0
output/.gitkeep Voir le fichier


BIN
participations.png Voir le fichier


+ 8
- 12
scripts/find_holes.py Voir le fichier

@@ -1,22 +1,18 @@
1 1
 from datetime import datetime
2 2
 from million.analyze.find_holes import compute_sequences, find_holes
3
-from million.view.bar_chart import plot as bar_chart
4
-from million.analyze.count_participations import count_participations
5 3
 from million.analyze.retain_counts import retain_counts
6
-from million.parse.fb_exports import FacebookExportParser
4
+import million.parse.fb_exports as fb
7 5
 
8 6
 
9 7
 DATA_PATH = './data/'
10 8
 
11
-parser = FacebookExportParser()
12
-
13
-export = parser.parse(DATA_PATH)
9
+export = fb.parse_dirfiles(DATA_PATH)
14 10
 
15 11
 filtered = retain_counts(export.messages)
16 12
 
17 13
 sequences = compute_sequences(filtered)
18 14
 
19
-actual_counted = sum([s.end - s.start for s in sequences])
15
+actual_counted = sum([s.length() for s in sequences])
20 16
 
21 17
 print(f"Actual counted: {actual_counted}")
22 18
 
@@ -25,11 +21,11 @@ holes = find_holes(filtered)
25 21
 print(len(holes))
26 22
 
27 23
 for hole in holes:
28
-    print(f"{hole.start} - {hole.end} ({hole.end - hole.start})")
24
+    print(f"{hole.start() + 1} -> {hole.end() - 1} ({hole.length() - 2})")
29 25
 
30 26
 
31 27
 # lets export a csv file of the holes and the people responsible for them
32
-with open('holes.csv', 'w') as f:
28
+with open('output/holes.csv', 'w') as f:
33 29
     f.write('début,fin,taille,responsable1,responsable2,date1,date2\n')
34 30
     for hole in holes:
35 31
         date_start = datetime.utcfromtimestamp(
@@ -37,9 +33,9 @@ with open('holes.csv', 'w') as f:
37 33
         date_end = datetime.utcfromtimestamp(
38 34
             hole.end_message.timestamp_ms / 1000.0).strftime('%Y-%m-%d %H:%M:%S')
39 35
         f.write(
40
-            f"{hole.start},"
41
-            f"{hole.end},"
42
-            f"{hole.end - hole.start},"
36
+            f"{hole.start()},"
37
+            f"{hole.end()},"
38
+            f"{hole.length()},"
43 39
             f"{hole.start_message.sender_name},"
44 40
             f"{hole.end_message.sender_name},"
45 41
             f"{date_start},{date_end}\n"

+ 2
- 4
scripts/read_top.py Voir le fichier

@@ -1,14 +1,12 @@
1 1
 from million.view.bar_chart import plot as bar_chart
2 2
 from million.analyze.count_participations import count_participations
3 3
 from million.analyze.retain_counts import retain_counts
4
-from million.parse.fb_exports import FacebookExportParser
4
+import million.parse.fb_exports as fb
5 5
 
6 6
 
7 7
 DATA_PATH = './data/'
8 8
 
9
-parser = FacebookExportParser()
10
-
11
-export = parser.parse(DATA_PATH)
9
+export = fb.parse_dirfiles(DATA_PATH)
12 10
 
13 11
 filtered = retain_counts(export.messages)
14 12
 

+ 1
- 1
test/TestCase.py Voir le fichier

@@ -6,4 +6,4 @@ from million.model.message import Message
6 6
 
7 7
 class TestCase(unittest.TestCase):
8 8
     def _message_with_text(self, text: str):
9
-        return Message(content=text, sender_name="test", timestamp_ms=0)
9
+        return Message(content=text, sender_name="test", timestamp_ms=0, is_geoblocked_for_viewer=True)

+ 5
- 4
test/model/message_test.py Voir le fichier

@@ -1,6 +1,7 @@
1 1
 
2 2
 
3 3
 from million.model.message import Message
4
+import million.analyze.message_evaluation as msg_val
4 5
 from test.TestCase import TestCase
5 6
 
6 7
 
@@ -9,19 +10,19 @@ class MessageTest(TestCase):
9 10
     def test_message_nominal(self, overrides=None, exclude=None):
10 11
         message = self._message_with_text("1")
11 12
 
12
-        assert 1 == message.get_counted_value()
13
+        assert 1 == msg_val.get(message)
13 14
 
14 15
     def test_message_with_text(self, overrides=None, exclude=None):
15 16
         message = self._message_with_text("1 text")
16 17
 
17
-        assert 1 == message.get_counted_value()
18
+        assert 1 == msg_val.get(message)
18 19
 
19 20
     def test_message_floored_dot(self, overrides=None, exclude=None):
20 21
         message = self._message_with_text("1.5")
21 22
 
22
-        assert 1 == message.get_counted_value()
23
+        assert 1 == msg_val.get(message)
23 24
 
24 25
     def test_message_floored_comma(self, overrides=None, exclude=None):
25 26
         message = self._message_with_text("1,5")
26 27
 
27
-        assert 1 == message.get_counted_value()
28
+        assert 1 == msg_val.get(message)

Chargement…
Annuler
Enregistrer