Ticket #21995: duplicate_class_check.py

File duplicate_class_check.py, 3.0 KB (added by taylor.smock, 2 years ago)

Quick script to find files that are duplicated across different jar files

Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import glob
5import zipfile
6import re
7from typing import List, Iterable, Set
8
9
class Plugin(object):
    """A jar file together with the archive entries that are worth comparing.

    Entries that match one of the ignore rules below are dropped when the
    instance is created. Two plugins are equal, and hash alike, when their
    names are equal.
    """

    name: str
    files: List[str]
    # Patterns are applied with ``re.match``, i.e. anchored at the start of
    # the entry name only.
    ignore_list_regex = [
        re.compile(r"^.*/$"),
        re.compile(r"data/.*"),
        re.compile(r"META-INF/LICENSE.*"),
        re.compile(r"META-INF/.*module-info.class"),
        re.compile(r"LICENSE.*"),
        re.compile(r"images/.*"),
    ]
    # Entry names that are ignored on an exact match.
    ignore_list = [
        "README",
        "GPL-v2.0.txt",
        "GPL-v3.0.txt",
        "GPL-3.0.txt",
        "META-INF/MANIFEST.MF",
        "META-INF/INDEX.LIST",
    ]

    @staticmethod
    def _not_ignored(file: str) -> bool:
        """Return True when *file* is hit by none of the ignore rules."""
        if any(pattern.match(file) for pattern in Plugin.ignore_list_regex):
            return False
        return file not in Plugin.ignore_list

    @staticmethod
    def _filter_files(files: List[str]) -> List[str]:
        """Return the entries of *files* that are not ignored, in order."""
        return list(filter(Plugin._not_ignored, files))

    def __init__(self, name: str, files: List[str]):
        """Store *name* and the non-ignored subset of *files*."""
        self.name = name
        self.files = Plugin._filter_files(files)

    def __repr__(self) -> str:
        return self.name

    def __hash__(self):
        # Hash on the name only, in line with __eq__.
        return hash(self.name)

    def __eq__(self, other):
        if not isinstance(other, Plugin):
            return False
        return self.name == other.name
46
47
class PluginDuplicates(object):
    """A pair of plugin names and the files that both of them contain.

    The two names are stored in sorted order, so the same pair compares and
    hashes equal regardless of the order in which the names were passed in.
    ``duplicate_files`` does not take part in equality or hashing.
    """

    plugin1: str
    plugin2: str
    duplicate_files: Set[str]

    def __init__(self, plugin1: str, plugin2: str, duplicate_files: Set[str]):
        """Store the names (smaller one first) and the shared file names."""
        # Normalise the order so (a, b) and (b, a) describe the same pair.
        if plugin1 < plugin2:
            self.plugin1 = plugin1
            self.plugin2 = plugin2
        else:
            self.plugin1 = plugin2
            self.plugin2 = plugin1
        self.duplicate_files = duplicate_files

    def __repr__(self) -> str:
        return self.plugin1 + ', ' + self.plugin2

    def __hash__(self):
        # Hash on exactly the fields that __eq__ compares.
        return hash((self.plugin1, self.plugin2))

    def __eq__(self, other):
        if isinstance(other, PluginDuplicates):
            return other.plugin1 == self.plugin1 and self.plugin2 == other.plugin2
        # Previously fell through and returned None for foreign types.
        return False
71
72
def read_plugins() -> Iterable[Plugin]:
    """Yield a Plugin for every jar file in the current working directory.

    Jars whose names end in ``-sources.jar`` or ``-javadoc.jar`` are skipped.
    Each archive is opened only long enough to read its entry names.
    """
    for jar in glob.glob("*.jar"):
        if jar.endswith(("-sources.jar", "-javadoc.jar")):
            continue
        # Close the archive before yielding so no file handle stays open
        # while the consumer works on (or abandons) the generator.
        with zipfile.ZipFile(jar) as zfile:
            entry_names = zfile.namelist()
        yield Plugin(jar, entry_names)
79
80
def get_common_files(
        file_list_1: Iterable[str], file_list_2: Iterable[str]) -> Set[str]:
    """Return the set of names that occur in both *file_list_1* and *file_list_2*."""
    return set(file_list_1).intersection(file_list_2)
86
87
def compare_plugins(plugins: Iterable[Plugin]) -> Iterable[PluginDuplicates]:
    """Yield a PluginDuplicates for every pair of plugins that share files.

    The input is first collapsed into a set, so plugins that compare equal
    are considered once. Each unordered pair is then examined exactly once;
    pairs with no common files produce nothing.
    """
    unique_plugins = list(set(plugins))
    for index, p1 in enumerate(unique_plugins):
        # Only look at plugins after p1: this avoids comparing a plugin with
        # itself and avoids reporting (p1, p2) again as (p2, p1).
        for p2 in unique_plugins[index + 1:]:
            common_files = get_common_files(p1.files, p2.files)
            if common_files:
                yield PluginDuplicates(p1.name, p2.name, common_files)
97
98
if __name__ == "__main__":
    # Report every pair of jars in the current directory that share files.
    plugins = read_plugins()
    bad_plugins = set(compare_plugins(plugins))
    # Iterating the set directly gives an order that can change between
    # runs; sort by the plugin pair so the report is reproducible.
    for plugin in sorted(bad_plugins,
                         key=lambda dup: (dup.plugin1, dup.plugin2)):
        print(plugin.plugin1, plugin.plugin2, plugin.duplicate_files)