llvm-project

Форк
0
/
extract_symbols.py 
488 строк · 18.2 Кб
1
#!/usr/bin/env python
2

3
"""A tool for extracting a list of symbols to export
4

5
When exporting symbols from a dll or exe we either need to mark the symbols in
6
the source code as __declspec(dllexport) or supply a list of symbols to the
7
linker. This program automates the latter by inspecting the symbol tables of a
8
list of link inputs and deciding which of those symbols need to be exported.
9

10
We can't just export all the defined symbols, as there's a limit of 65535
11
exported symbols and in clang we go way over that, particularly in a debug
12
build. Therefore a large part of the work is pruning symbols which either can't
13
be imported, or which we think are things that have definitions in public header
14
files (i.e. template instantiations) and we would get defined in the thing
15
importing these symbols anyway.
16
"""
17

18
from __future__ import print_function
19
import sys
20
import re
21
import os
22
import subprocess
23
import multiprocessing
24
import argparse
25
import platform
26

27
# Define a function which extracts a list of pairs of (symbols, is_def) from a
28
# library using llvm-nm because it can work both with regular and bitcode files.
29
# We use subprocess.Popen and yield a symbol at a time instead of using
30
# subprocess.check_output and returning a list as, especially on Windows, waiting
31
# for the entire output to be ready can take a significant amount of time.
32
def nm_get_symbols(tool, lib):
    """Yield ``(symbol, is_def)`` pairs for the global symbols of *lib*.

    Runs *tool* (an nm-compatible executable such as llvm-nm) on *lib* and
    parses its output line by line, so symbols are produced while the tool
    is still running instead of after all of its output is available.

    Args:
        tool: Path to the nm executable to run.
        lib: Path to the library or object file to inspect.

    Yields:
        ``(name, True)`` for each symbol whose type is one of
        ``BDGRSTuVW`` and ``(name, False)`` for each undefined (type ``U``)
        symbol.
    """
    # '-P' means the output is in portable format,
    # '-g' means we only get global symbols,
    # '-Xany' enforce handling both 32- and 64-bit objects on AIX,
    # '--no-demangle' ensure that C++ symbol names are not demangled; note
    #   that llvm-nm do not demangle by default, but the system nm on AIX does
    #   that, so the behavior may change in the future,
    # '-p' do not waste time sorting the symbols.
    cmd = [tool, "-P", "-g", "-Xany", "--no-demangle", "-p", lib]

    # The POSIX format is:
    #   name   type   value   size
    # The -P flag displays the size field for symbols only when applicable,
    # so the last field is optional. There's no space after the value field,
    # but \s+ match newline also, so \s+\S* will match the optional size field.
    defined_re = re.compile(r"^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$")
    # Undefined symbols have type U and may or may not (depending on which nm
    # is being used) have value and size.
    undefined_re = re.compile(r"^(\S+)\s+U\s+(\S+\s+\S*)?$")

    # The context manager closes the pipe and waits for the process even when
    # the consumer stops iterating early. The child gets no stdin so it can
    # never block waiting for input.
    with subprocess.Popen(
        cmd,
        bufsize=1,
        stdout=subprocess.PIPE,
        stdin=subprocess.DEVNULL,
        universal_newlines=True,
    ) as process:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = defined_re.match(line)
            if match:
                yield (match.group(1), True)
            # Look for undefined symbols
            match = undefined_re.match(line)
            if match:
                yield (match.group(1), False)
65

66

67
# Define a function which determines if the target is 32-bit Windows (as that's
68
# where calling convention name decoration happens).
69
def readobj_is_32bit_windows(tool, lib):
    """Return whether *tool* reports the format of *lib* as ``COFF-i386``.

    Args:
        tool: Path to the readobj executable; it is run with
            ``--file-header``.
        lib: Path to the library or object file to inspect.

    Returns:
        True if the first ``Format:`` line of the output names
        ``COFF-i386``, False otherwise (including when there is no such
        line).

    Raises:
        subprocess.CalledProcessError: If the tool exits with a non-zero
            status.
    """
    output = subprocess.check_output(
        [tool, "--file-header", lib], universal_newlines=True
    )
    format_re = re.compile(r"Format: (\S+)")
    for line in output.splitlines():
        match = format_re.match(line)
        if match:
            # Only the first Format line is considered.
            return match.group(1) == "COFF-i386"
    return False
78

79

80
# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
81
# identifier/type mangling we can decide which symbols could possibly be
82
# required and which we can discard.
83
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a Microsoft-mangled symbol should be kept.

    Args:
        symbol: The symbol name as reported by nm.
        calling_convention_decoration: Whether unmangled names carry
            calling convention decoration that should be stripped.

    Returns:
        The symbol to keep (with decoration removed for unmangled names
        when requested), or None if the symbol should be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if "?" not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match("[_@]([^@]+)", symbol)
            if match:
                symbol = match.group(1)
        # Discard floating point/SIMD constants.
        if symbol.startswith(("__xmm@", "__ymm@", "__real@")):
            return None
        return symbol
    # Deleting destructors start with ??_G or ??_E and can be discarded
    # because link.exe gives you a warning telling you they can't be exported
    # if you don't
    if symbol.startswith(("??_G", "??_E")):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    if re.search(r"\?A(0x\w+)?@", symbol):
        return None
    # Skip X86GenMnemonicTables functions, they are not exposed from
    # llvm/include/.
    if re.match(r"\?is[A-Z0-9]*@X86@llvm", symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search(r"(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$", symbol):
        return symbol
    return None
129

130

131
# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
132
# demangle the identifier mangling to identify symbols that can be safely
133
# discarded.
134
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether an Itanium-mangled symbol should be kept.

    Args:
        symbol: The symbol name as reported by nm.
        calling_convention_decoration: Whether symbols carry a leading
            underscore that should be stripped first.

    Returns:
        The symbol to keep (with the decoration removed when requested),
        or None if the symbol should be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith("_"):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith(("_", ".")):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match("_Z(T[VTIS])?(N.+)", symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Keep llvm:: and clang:: names, discard everything else
    if names[0][0] in ("4llvm", "5clang"):
        return symbol
    return None
160

161

162
# Certain kinds of complex manglings we assume cannot be part of a public
163
# interface, and we handle them by raising an exception.
164
class TooComplexName(Exception):
    """Raised for manglings assumed too complex to be a public interface."""
166

167

168
# Parse an itanium mangled name from the start of a string and return a
169
# (name, rest of string) pair.
170
def parse_itanium_name(arg):
    """Parse one Itanium-mangled name from the start of *arg*.

    Args:
        arg: The mangled text to parse.

    Returns:
        A ``(name, rest)`` pair. For a length-prefixed name, ``name`` keeps
        its length prefix (e.g. ``"4llvm"``). ``name`` is None, and ``rest``
        is *arg* unchanged, when nothing could be parsed.
    """
    # Check for a normal name: <length><identifier>
    match = re.match(r"(\d+)(.+)", arg)
    if match:
        length = int(match.group(1))
        tail = match.group(2)
        return match.group(1) + tail[:length], tail[length:]
    # Check for constructor/destructor names
    match = re.match("([CD][123])(.+)", arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match("([^E]+)(.+)", arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg
189

190

191
# Parse an itanium mangled template argument list from the start of a string
192
# and throw it away, returning the rest of the string.
193
def skip_itanium_template(arg):
    """Skip the Itanium-mangled template argument list at the start of *arg*.

    Args:
        arg: Mangled text, which must begin with ``I``.

    Returns:
        The text following the ``E`` that closes the argument list, or None
        if the text runs out before the closing ``E`` is found.

    Raises:
        TooComplexName: If an argument starts with ``L`` or ``X``, which is
            taken to be the start of an expression.
    """
    # A template argument list starts with I
    assert arg.startswith("I"), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r"(\d+)(.+)", tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match("S[A-Z0-9]*_(.+)", tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith("I"):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith("N"):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith(("L", "X")):
            raise TooComplexName
        # End of the template
        elif tmp.startswith("E"):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    # Ran out of input without seeing the closing E
    return None
224

225

226
# Parse an itanium mangled nested name and transform it into a list of pairs of
227
# (name, is_template), returning (list, rest of string).
228
def parse_itanium_nested_name(arg):
    """Parse an Itanium-mangled nested name from the start of *arg*.

    Args:
        arg: Mangled text, which must begin with ``N``.

    Returns:
        ``(names, rest)`` where ``names`` is a list of
        ``(name, is_template)`` pairs and ``rest`` is the text after the
        closing ``E``. Returns ``(None, None)`` if the name could not be
        parsed.

    Raises:
        TooComplexName: Propagated from skip_itanium_template.
    """
    # A nested name starts with N
    assert arg.startswith("N"), arg
    ret = []

    # Skip past the N, and possibly a substitution
    match = re.match("NS[A-Z0-9]*_(.+)", arg)
    tmp = match.group(1) if match else arg[1:]

    # Skip past CV-qualifiers and ref qualifiers
    match = re.match("[rVKRO]*(.+)", tmp)
    if match:
        tmp = match.group(1)

    # Repeatedly parse names from the string until we reach the end of the
    # nested name
    while tmp:
        # An E ends the nested name
        if tmp.startswith("E"):
            return ret, tmp[1:]
        # Parse a name
        name_part, tmp = parse_itanium_name(tmp)
        if not name_part:
            # If we failed then we don't know how to demangle this
            return None, None
        # If this name is a template record that, then skip the template
        # arguments
        is_template = tmp.startswith("I")
        if is_template:
            tmp = skip_itanium_template(tmp)
        # Add the name to the list
        ret.append((name_part, is_template))

    # If we get here then something went wrong
    return None, None
267

268

269
# Parse a microsoft mangled symbol and return a list of pairs of
270
# (name, is_template). This is very rudimentary and does just enough
271
# in order to determine if the first or second component is a template.
272
def parse_microsoft_mangling(arg):
    """Split a Microsoft-mangled symbol into ``(name, is_template)`` pairs.

    This is very rudimentary: it recognises simple names, special function
    names and template names, and stops at the first empty component or at
    anything it cannot handle.

    Args:
        arg: The symbol to parse.

    Returns:
        A list of ``(name, is_template)`` pairs. A symbol that does not
        start with ``?`` is returned as a single non-template component.
    """
    # If the name doesn't start with ? this isn't a mangled name
    if not arg.startswith("?"):
        return [(arg, False)]
    arg = arg[1:]
    components = []
    while arg:
        # If we see an empty component we've reached the end
        if arg.startswith("@"):
            return components
        # Check for a simple name
        match = re.match(r"(\w+)@(.+)", arg)
        if match:
            components.append((match.group(1), False))
            arg = match.group(2)
            continue
        # Check for a special function name
        match = re.match(r"(\?_?\w)(.+)", arg)
        if match:
            components.append((match.group(1), False))
            arg = match.group(2)
            continue
        # Check for a template name
        match = re.match(r"\?\$(\w+)@[^@]+@(.+)", arg)
        if match:
            components.append((match.group(1), True))
            arg = match.group(2)
            continue
        # Some other kind of name that we can't handle
        components.append((arg, False))
        return components
    return components
304

305

306
def extract_symbols(arg):
    """Collect the kept symbol definitions and references of one library.

    Takes a single tuple so that it can be used as a multiprocessing map
    function.

    Args:
        arg: A ``(llvm_nm_path, should_keep_symbol,
            calling_convention_decoration, lib)`` tuple.

    Returns:
        ``(symbol_defs, symbol_refs)`` where ``symbol_defs`` maps each kept
        defined symbol to the number of times it was reported as defined,
        and ``symbol_refs`` is the set of kept undefined symbols.
    """
    llvm_nm_path, should_keep_symbol, calling_convention_decoration, lib = arg
    symbol_defs = {}
    symbol_refs = set()
    for raw_symbol, is_def in nm_get_symbols(llvm_nm_path, lib):
        # The filter returns the (possibly undecorated) name, or None to drop
        symbol = should_keep_symbol(raw_symbol, calling_convention_decoration)
        if not symbol:
            continue
        if is_def:
            symbol_defs[symbol] = symbol_defs.get(symbol, 0) + 1
        else:
            symbol_refs.add(symbol)
    return (symbol_defs, symbol_refs)
318

319

320
def get_template_name(sym, mangling):
    """Return the name of the first template component of *sym*, if any.

    Args:
        sym: The mangled symbol name.
        mangling: ``"microsoft"`` to parse as a Microsoft mangling; any
            other value is treated as Itanium.

    Returns:
        The first component name flagged as a template, or None if the
        symbol has no template component, cannot be parsed, or is too
        complex.
    """
    # Parse the mangling into a list of (name, is_template)
    try:
        if mangling == "microsoft":
            names = parse_microsoft_mangling(sym)
        else:
            match = re.match("_Z(T[VTIS])?(N.+)", sym)
            if match:
                names, _ = parse_itanium_nested_name(match.group(2))
            else:
                names = None
    except TooComplexName:
        return None

    if not names:
        return None

    # If any component is a template then return it; otherwise it is not a
    # template and we return None
    return next((name for name, is_template in names if is_template), None)
344

345

346
def parse_tool_path(parser, tool, val):
    """Validate that *val* can be executed and return it unchanged.

    Intended for use as an argparse ``type`` callback.

    Args:
        parser: The argparse parser whose ``error`` method is called when
            the path cannot be executed.
        tool: Short tool name used in the error message.
        val: The path to check.

    Returns:
        *val*, if running it succeeded in starting a process. The exit
        status of that process is ignored.
    """
    try:
        # Discard all output and give the process an empty stdin, as we don't
        # want any output and we don't want it to wait for something on stdin.
        subprocess.run(
            [val],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            stdin=subprocess.DEVNULL,
            check=False,
        )
    except (OSError, ValueError):
        # OSError: the file is missing or not executable.
        # ValueError: the path is not a valid argument (e.g. embedded NUL).
        parser.error(f"Invalid path for {tool}")
    return val
364

365

366
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Extract symbols to export from libraries"
    )
    parser.add_argument(
        "--mangling",
        choices=["itanium", "microsoft"],
        required=True,
        help="expected symbol mangling scheme",
    )
    # Both tools are used unconditionally below, so insist on them up front
    # rather than failing later with a traceback.
    parser.add_argument(
        "--nm",
        metavar="path",
        required=True,
        type=lambda x: parse_tool_path(parser, "nm", x),
        help="path to the llvm-nm executable",
    )
    parser.add_argument(
        "--readobj",
        metavar="path",
        required=True,
        type=lambda x: parse_tool_path(parser, "readobj", x),
        help="path to the llvm-readobj executable",
    )
    parser.add_argument(
        "libs",
        metavar="lib",
        type=str,
        nargs="+",
        help="libraries to extract symbols from",
    )
    parser.add_argument("-o", metavar="file", type=str, help="output to file")
    args = parser.parse_args()

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == "microsoft":
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    suffixes = (".lib", ".a", ".obj", ".o")
    libs = []
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        if not lib.endswith(suffixes):
            for s in suffixes:
                if os.path.exists(lib + s):
                    lib = lib + s
                    break
                if os.path.exists("lib" + lib + s):
                    lib = "lib" + lib + s
                    break
        if not lib.endswith(suffixes):
            print("Don't know what to do with argument " + lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = readobj_is_32bit_windows(args.readobj, libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    # FIXME: On AIX, the default pool size can be too big for a logical
    #        partition's allocated memory, and can lead to an out of memory
    #        IO error. We are setting the pool size to 8 to avoid such
    #        errors at the moment, and will look for a graceful solution later.
    if platform.system() == "AIX":
        pool = multiprocessing.Pool(8)
    else:
        pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [
            (args.nm, should_keep_symbol, calling_convention_decoration, x)
            for x in libs
        ]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
        # All work is done; reap the worker processes.
        pool.join()
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbol_defs = {}
    symbol_refs = set()
    for this_lib_defs, this_lib_refs in libs_symbols:
        for k, v in this_lib_defs.items():
            symbol_defs[k] = symbol_defs.get(k, 0) + v
        symbol_refs.update(this_lib_refs)

    # Find which template instantiations are referenced at least once.
    template_instantiation_refs = set()
    for sym in symbol_refs:
        template = get_template_name(sym, args.mangling)
        if template:
            template_instantiation_refs.add(template)

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Are not a template instantiation that isn't referenced anywhere. This
    #    is because we need to export any explicitly instantiated templates,
    #    and we expect those to be referenced in some object.
    outfile = open(args.o, "w") if args.o else sys.stdout
    try:
        for k, v in symbol_defs.items():
            template = get_template_name(k, args.mangling)
            if v == 1 and (not template or template in template_instantiation_refs):
                print(k, file=outfile)
    finally:
        # Close the file we opened, but never close the process's stdout.
        if outfile is not sys.stdout:
            outfile.close()
489

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.