Browse Source

Add script to find holes in counted values

pull/2/head
demisel 6 months ago
parent
commit
5f77c31aa3
5 changed files with 159 additions and 1 deletions
  1. 73
    0
      million/analyze/find_holes.py
  2. 11
    0
      million/model/hole.py
  3. 17
    1
      million/model/message.py
  4. 12
    0
      million/model/sequence.py
  5. 46
    0
      scripts/find_holes.py

+ 73
- 0
million/analyze/find_holes.py View File

@@ -0,0 +1,73 @@
1
+
2
+
3
+from typing import List
4
+from million.model.hole import Hole
5
+from million.model.message import Message
6
+from million.model.sequence import Sequence
7
+
8
+
9
def compute_sequences(messages: List[Message], accepted_max: int = 1_000_000) -> List[Sequence]:
    """
    Group the counted values of the messages into runs of consecutive numbers.

    The messages are scanned in the given order: a message whose counted value
    is exactly the end of the current run + 1 extends that run, any other value
    starts a new run. The runs are then sorted by start, and runs that overlap
    or directly follow each other are merged.

    Messages whose counted value is greater than ``accepted_max`` are ignored
    (including the very first one).

    Returns the merged sequences ordered by start, without the sequences made
    of a single value (start == end). Returns an empty list when no message is
    usable.
    """
    sequences: List[Sequence] = []
    current_sequence = None
    for message in messages:
        message_value = message.get_counted_value()
        if message_value > accepted_max:
            continue
        if current_sequence is not None and message_value - current_sequence.end == 1:
            # The value directly follows the current run: extend it
            current_sequence.end = message_value
            current_sequence.end_message = message
            continue
        if current_sequence is not None:
            sequences.append(current_sequence)
        current_sequence = Sequence(
            start=message_value,
            start_message=message,
            end=message_value,
            end_message=message
        )
    # The run still open at the end of the scan must be kept too
    if current_sequence is not None:
        sequences.append(current_sequence)

    if not sequences:
        return []

    # order the sequences by start
    sequences.sort(key=lambda s: s.start)

    merged_sequences: List[Sequence] = []
    current_sequence = sequences[0]
    for sequence in sequences[1:]:
        # Sequences are sorted by start, so sequence.start >= current_sequence.start:
        # the two overlap or touch as soon as the start is at most current end + 1
        if sequence.start <= current_sequence.end + 1:
            if sequence.end > current_sequence.end:
                current_sequence.end = sequence.end
                current_sequence.end_message = sequence.end_message
        else:
            merged_sequences.append(current_sequence)
            current_sequence = sequence
    # The sequence being merged when the loop ends is part of the result
    merged_sequences.append(current_sequence)

    # Having merged the sequences once, any sequence having start = end can be removed
    return [s for s in merged_sequences if s.start != s.end]
55
+
56
+
57
def find_holes(messages: List[Message], accepted_max: int = 1_000_000) -> List[Hole]:
    """
    Find the holes in the conversation

    The sequences returned by compute_sequences(messages, accepted_max) are
    compared two by two, in order. Whenever the start of a sequence is more
    than 1 above the end of the previous one, a Hole is reported with:
    - start / start_message: end of the previous sequence
    - end / end_message: start of the following sequence
    """
    ordered = compute_sequences(messages, accepted_max)
    return [
        Hole(
            start=before.end,
            end=after.start,
            start_message=before.end_message,
            end_message=after.start_message
        )
        for before, after in zip(ordered, ordered[1:])
        if after.start - before.end > 1
    ]

+ 11
- 0
million/model/hole.py View File

@@ -0,0 +1,11 @@
1
+
2
+from pydantic import BaseModel
3
+
4
+from million.model.message import Message
5
+
6
+
7
# A gap in the counted values, described by its two boundaries
# and the messages attached to them.
class Hole(BaseModel):
    # Counted value at the lower boundary of the gap
    start: int
    # Counted value at the upper boundary of the gap
    end: int

    # Message attached to the lower boundary
    start_message: Message
    # Message attached to the upper boundary
    end_message: Message

+ 17
- 1
million/model/message.py View File

@@ -1,4 +1,5 @@
1 1
 
2
+from math import floor
2 3
 from typing import Optional
3 4
 from pydantic import BaseModel
4 5
 
@@ -7,4 +8,19 @@ class Message(BaseModel):
7 8
     sender_name: str
8 9
     timestamp_ms: int
9 10
     content: Optional[str] = None
10
-    is_geoblocked_for_viewer: Optional[bool] = None
11
+    is_geoblocked_for_viewer: Optional[bool] = None
12
+
13
+    def get_counted_value(self):
14
+        """
15
+        The content of the message should be (or contain) a number
16
+        """
17
+        value = None
18
+        # Remove any number that is not a digit
19
+        # TODO parse potential math expressions in content
20
+        cleaned_content = ''.join([c for c in self.content if c.isdigit()])
21
+        try:
22
+            value = floor(float(cleaned_content))
23
+        except Exception as e:
24
+            raise ValueError(
25
+                f"Message {cleaned_content} does not contain a number ({e})")
26
+        return value

+ 12
- 0
million/model/sequence.py View File

@@ -0,0 +1,12 @@
1
+
2
+from typing import Optional
3
+from pydantic import BaseModel
4
+
5
+from million.model.message import Message
6
+
7
+
8
# A run of counted values, described by its first and last value
# and the messages attached to them.
class Sequence(BaseModel):
    # First counted value of the run and the message attached to it
    start: int
    start_message: Message

    # Last counted value of the run and the message attached to it
    end: int
    end_message: Message

+ 46
- 0
scripts/find_holes.py View File

@@ -0,0 +1,46 @@
1
import csv
from datetime import datetime, timezone
from million.analyze.find_holes import compute_sequences, find_holes
from million.view.bar_chart import plot as bar_chart
from million.analyze.count_participations import count_participations
from million.analyze.retain_counts import retain_counts
from million.parse.fb_exports import FacebookExportParser


DATA_PATH = './data/'
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'


def _format_timestamp(timestamp_ms: int) -> str:
    """Format a timestamp in milliseconds as a UTC date string."""
    moment = datetime.fromtimestamp(timestamp_ms / 1000.0, tz=timezone.utc)
    return moment.strftime(DATE_FORMAT)


parser = FacebookExportParser()

export = parser.parse(DATA_PATH)

filtered = retain_counts(export.messages)

sequences = compute_sequences(filtered)

# Sum of (end - start) over the sequences
actual_counted = sum(s.end - s.start for s in sequences)

print(f"Actual counted: {actual_counted}")

holes = find_holes(filtered)

print(len(holes))

for hole in holes:
    print(f"{hole.start} - {hole.end} ({hole.end - hole.start})")


# let's export a csv file of the holes and the people responsible for them
# - explicit utf-8 because the header contains a non-ASCII character
# - csv.writer quotes the sender names that contain a comma
with open('holes.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(
        ['début', 'fin', 'taille', 'responsable1', 'responsable2', 'date1', 'date2'])
    for hole in holes:
        writer.writerow([
            hole.start,
            hole.end,
            hole.end - hole.start,
            hole.start_message.sender_name,
            hole.end_message.sender_name,
            _format_timestamp(hole.start_message.timestamp_ms),
            _format_timestamp(hole.end_message.timestamp_ms),
        ])

Loading…
Cancel
Save