Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

export_data-v2.py 3.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. # ------------------------------------------------
  2. # Imports
  3. # ------------------------------------------------
  4. from matplotlib import pyplot as plt
  5. import os
  6. import json
  7. import re
  8. # ------------------------------------------------
  9. # Constants
  10. # ------------------------------------------------
  11. DATA_PATH = 'D:/Files/Data/Messenger/'
  12. OTHER_LABEL = 'Les Autres'
  13. # JSON tags
  14. PARTICIPANTS = 'participants'
  15. MESSAGES = 'messages'
  16. NAME = 'name'
  17. CONTENT = 'content'
  18. TIMESTAMP = 'timestamp_ms'
  19. SENDER = 'sender_name'
  20. # ------------------------------------------------
  21. # Functions
  22. # ------------------------------------------------
  23. def readBrokenFbJson(datafile_path):
  24. # ntm facebook
  25. # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
  26. with open(datafile_path, 'rb') as data_file:
  27. binary_data = data_file.read()
  28. replace_func = lambda m: bytes.fromhex(m.group(1).decode())
  29. pattern = rb'\\u00([\da-f]{2})'
  30. repaired = re.sub(pattern, replace_func, binary_data)
  31. return json.loads(repaired.decode('utf8'))
  32. def computeData():
  33. # Tous les fichiers du dossier sont traités sans validation
  34. datafiles_path = [DATA_PATH + filename for filename in os.listdir(DATA_PATH) if filename [-4:] == "json"]
  35. print(datafiles_path)
  36. messages, participants = [], []
  37. for datafile_path in datafiles_path:
  38. datacontent = readBrokenFbJson(datafile_path)
  39. if datacontent is None : continue
  40. participants += datacontent[PARTICIPANTS]
  41. messages += datacontent[MESSAGES]
  42. participants = cleanParticipants(participants)
  43. messages = cleanMessages(messages)
  44. return participants, messages
  45. def cleanParticipants(rawParticipants):
  46. return set([participant[NAME] for participant in rawParticipants])
  47. def cleanMessages(rawMessages):
  48. cleanMessages = [message for message in rawMessages if CONTENT in message]
  49. return sorted(cleanMessages, key = lambda x: x[TIMESTAMP])
  50. # TODO tester l'approche en recherche incrémentale
  51. # Jeu de données du 14/10/2021. 33679 messages conservés
  52. # pour un compte final de 34120. Soit une perte estimée à 1.3%
  53. def filterMessages(messages):
  54. return [msg for msg in messages if re.search('(\d{2,}|^\d$)', msg[CONTENT])]
  55. def computeParticipation(messages):
  56. result = {}
  57. for message in messages:
  58. sender = message[SENDER]
  59. result[sender] = result[sender]+1 if sender in result else 1
  60. return sorted(result.items(), key = lambda x: x[1])
  61. def mergeSmallParticipation(rawParticipation, threshold = 1):
  62. values = [e[1] for e in participation]
  63. labels = [e[0] for e in participation]
  64. totalValues = sum(values)
  65. for idx, value in enumerate(values):
  66. if 100 * value / totalValues >= threshold: break
  67. return [sum(values[0:idx])] + values[idx:], [OTHER_LABEL] + labels[idx:]
  68. def displayParticipation(participation):
  69. values, labels = mergeSmallParticipation(participation)
  70. plt.figure(figsize=(8,7), tight_layout = True)
  71. plt.pie(values,
  72. startangle = 90,
  73. counterclock = False,
  74. labels = labels,
  75. rotatelabels = True)
  76. plt.show()
  77. # ------------------------------------------------
  78. # Main Code
  79. # ------------------------------------------------
  80. participants, messages = computeData()
  81. messages = filterMessages(messages)
  82. participation = computeParticipation(messages)
  83. displayParticipation(participation)