[PATCH] fetch: Support GLEP 75 mirror structure

classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|

[PATCH] fetch: Support GLEP 75 mirror structure

Michał Górny-5
Add support for the subset of GLEP 75 needed by Gentoo Infra.  This
includes fetching and parsing layout.conf, and support for flat layout
and filename-hash layout with cutoffs being multiples of 4.

Bug: https://bugs.gentoo.org/646898
Signed-off-by: Michał Górny <[hidden email]>
---
 lib/portage/package/ebuild/fetch.py | 113 +++++++++++++++++++++++++++-
 1 file changed, 109 insertions(+), 4 deletions(-)

diff --git a/lib/portage/package/ebuild/fetch.py b/lib/portage/package/ebuild/fetch.py
index 227bf45ae..692efcc01 100644
--- a/lib/portage/package/ebuild/fetch.py
+++ b/lib/portage/package/ebuild/fetch.py
@@ -7,12 +7,15 @@ __all__ = ['fetch']
 
 import errno
 import io
+import itertools
+import json
 import logging
 import random
 import re
 import stat
 import sys
 import tempfile
+import time
 
 from collections import OrderedDict
 
@@ -27,14 +30,17 @@ portage.proxy.lazyimport.lazyimport(globals(),
  'portage.package.ebuild.doebuild:doebuild_environment,' + \
  '_doebuild_spawn',
  'portage.package.ebuild.prepare_build_dirs:prepare_build_dirs',
+ 'portage.util.configparser:SafeConfigParser,read_configs,NoOptionError',
+ 'portage.util._urlopen:urlopen',
 )
 
 from portage import os, selinux, shutil, _encodings, \
  _movefile, _shell_quote, _unicode_encode
 from portage.checksum import (get_valid_checksum_keys, perform_md5, verify_all,
- _filter_unaccelarated_hashes, _hash_filter, _apply_hash_filter)
+ _filter_unaccelarated_hashes, _hash_filter, _apply_hash_filter,
+ checksum_str)
 from portage.const import BASH_BINARY, CUSTOM_MIRRORS_FILE, \
- GLOBAL_CONFIG_PATH
+ GLOBAL_CONFIG_PATH, CACHE_PATH
 from portage.data import portage_gid, portage_uid, secpass, userpriv_groups
 from portage.exception import FileNotFound, OperationNotPermitted, \
  PortageException, TryAgain
@@ -253,6 +259,104 @@ _size_suffix_map = {
  'Y' : 80,
 }
 
+
+def filename_hash_path(filename, algo, cutoffs):
+ """
+ Get directory path for filename in filename-hash mirror structure.
+
+ @param filename: Filename to fetch
+ @param algo: Hash algorithm
+ @param cutoffs: Cutoff values (n:n...)
+ @return: Directory path
+ """
+
+ fnhash = checksum_str(filename.encode('utf8'), algo)
+ ret = ''
+ for c in cutoffs.split(':'):
+ c = int(c) // 4
+ ret += fnhash[:c] + '/'
+ fnhash = fnhash[c:]
+ return ret
+
+
+def get_mirror_url(mirror_url, filename, eroot):
+ """
+ Get correct fetch URL for a given file, accounting for mirror
+ layout configuration.
+
+ @param mirror_url: Base URL to the mirror (without '/distfiles')
+ @param filename: Filename to fetch
+ @param eroot: EROOT to use for the cache file
+ @return: Full URL to fetch
+ """
+
+ cache_file = os.path.join(eroot, CACHE_PATH, 'mirror-metadata.json')
+ try:
+ with open(cache_file, 'r') as f:
+ cache = json.load(f)
+ except (IOError, ValueError):
+ cache = {}
+
+ ts, layout = cache.get(mirror_url, (0, None))
+ # refresh at least daily
+ if ts < time.time() - 86400:
+ # the default
+ layout = ('flat',)
+
+ try:
+ f = urlopen(mirror_url + '/distfiles/layout.conf')
+ try:
+ data = io.StringIO(f.read().decode('utf8'))
+ finally:
+ f.close()
+ cp = SafeConfigParser()
+ read_configs(cp, [data])
+
+ for i in itertools.count():
+ try:
+ val = tuple(cp.get('structure', '%d' % i).split())
+ if val == ('flat',):
+ pass
+ elif val[0] == 'filename-hash' and len(val) == 3:
+ if val[1] not in get_valid_checksum_keys():
+ continue
+ # validate cutoffs
+ cutoffs_good = False
+ for c in val[2].split(':'):
+ try:
+ c = int(c)
+ except ValueError:
+ break
+ else:
+ if c % 4 != 0:
+ break
+ else:
+ cutoffs_good = True
+ if not cutoffs_good:
+ continue
+ else:
+ # (skip unsupported variant)
+ continue
+ layout = val
+ break
+ except NoOptionError:
+ break
+ except IOError:
+ pass
+
+ cache[mirror_url] = (time.time(), layout)
+ with open(cache_file, 'w') as f:
+ json.dump(cache, f)
+
+ if layout[0] == 'flat':
+ return mirror_url + "/distfiles/" + filename
+ elif layout[0] == 'filename-hash':
+ return (mirror_url + "/distfiles/" +
+ filename_hash_path(filename, *layout[1:]) + filename)
+ else:
+ raise AssertionError("get_mirror_url() got unknown layout type")
+
+
 def fetch(myuris, mysettings, listonly=0, fetchonly=0,
  locks_in_subdir=".locks", use_locks=1, try_mirrors=1, digests=None,
  allow_missing_digests=True):
@@ -434,8 +538,9 @@ def fetch(myuris, mysettings, listonly=0, fetchonly=0,
  for myfile, myuri in file_uri_tuples:
  if myfile not in filedict:
  filedict[myfile]=[]
- for y in range(0,len(locations)):
- filedict[myfile].append(locations[y]+"/distfiles/"+myfile)
+ for l in locations:
+ filedict[myfile].append(get_mirror_url(l, myfile,
+ mysettings["EROOT"]))
  if myuri is None:
  continue
  if myuri[:9]=="mirror://":
--
2.23.0


Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] fetch: Support GLEP 75 mirror structure

Alec Warner-2


On Thu, Oct 3, 2019 at 7:52 AM Michał Górny <[hidden email]> wrote:
Add support for the subset of GLEP 75 needed by Gentoo Infra.  This
includes fetching and parsing layout.conf, and support for flat layout
and filename-hash layout with cutoffs being multiples of 4.

Bug: https://bugs.gentoo.org/646898
Signed-off-by: Michał Górny <[hidden email]>
---
 lib/portage/package/ebuild/fetch.py | 113 +++++++++++++++++++++++++++-
 1 file changed, 109 insertions(+), 4 deletions(-)

diff --git a/lib/portage/package/ebuild/fetch.py b/lib/portage/package/ebuild/fetch.py
index 227bf45ae..692efcc01 100644
--- a/lib/portage/package/ebuild/fetch.py
+++ b/lib/portage/package/ebuild/fetch.py
@@ -7,12 +7,15 @@ __all__ = ['fetch']

 import errno
 import io
+import itertools
+import json
 import logging
 import random
 import re
 import stat
 import sys
 import tempfile
+import time

 from collections import OrderedDict

@@ -27,14 +30,17 @@ portage.proxy.lazyimport.lazyimport(globals(),
        'portage.package.ebuild.doebuild:doebuild_environment,' + \
                '_doebuild_spawn',
        'portage.package.ebuild.prepare_build_dirs:prepare_build_dirs',
+       'portage.util.configparser:SafeConfigParser,read_configs,NoOptionError',
+       'portage.util._urlopen:urlopen',
 )

 from portage import os, selinux, shutil, _encodings, \
        _movefile, _shell_quote, _unicode_encode
 from portage.checksum import (get_valid_checksum_keys, perform_md5, verify_all,
-       _filter_unaccelarated_hashes, _hash_filter, _apply_hash_filter)
+       _filter_unaccelarated_hashes, _hash_filter, _apply_hash_filter,
+       checksum_str)
 from portage.const import BASH_BINARY, CUSTOM_MIRRORS_FILE, \
-       GLOBAL_CONFIG_PATH
+       GLOBAL_CONFIG_PATH, CACHE_PATH
 from portage.data import portage_gid, portage_uid, secpass, userpriv_groups
 from portage.exception import FileNotFound, OperationNotPermitted, \
        PortageException, TryAgain
@@ -253,6 +259,104 @@ _size_suffix_map = {
        'Y' : 80,
 }

+
+def filename_hash_path(filename, algo, cutoffs):
+       """
+       Get directory path for filename in filename-hash mirror structure.
+
+       @param filename: Filename to fetch
+       @param algo: Hash algorithm
+       @param cutoffs: Cutoff values (n:n...)
+       @return: Directory path
+       """
+
+       fnhash = checksum_str(filename.encode('utf8'), algo)
+       ret = ''
+       for c in cutoffs.split(':'):
+               c = int(c) // 4
+               ret += fnhash[:c] + '/'

When making a path, please use os.path.join()
 
+               fnhash = fnhash[c:]
+       return ret
+
+
+def get_mirror_url(mirror_url, filename, eroot):
+       """
+       Get correct fetch URL for a given file, accounting for mirror
+       layout configuration.
+
+       @param mirror_url: Base URL to the mirror (without '/distfiles')
+       @param filename: Filename to fetch
+       @param eroot: EROOT to use for the cache file
+       @return: Full URL to fetch
+       """
+
+       cache_file = os.path.join(eroot, CACHE_PATH, 'mirror-metadata.json')
+       try:
+               with open(cache_file, 'r') as f:
+                       cache = json.load(f)
+       except (IOError, ValueError):
+               cache = {}

I'm a bit worried that we are opening this cache file off of disk every time we call get_mirror_url(). Can we just cache the contents in memory between calls; or even better pass the cache in as argument rather than it be contained in get_mirror_url?
 
+
+       ts, layout = cache.get(mirror_url, (0, None))
+       # refresh at least daily
+       if ts < time.time() - 86400:
+               # the default
+               layout = ('flat',)
+
+               try:
+                       f = urlopen(mirror_url + '/distfiles/layout.conf')
+                       try:
+                               data = io.StringIO(f.read().decode('utf8'))
+                       finally:
+                               f.close()
+                       cp = SafeConfigParser()
+                       read_configs(cp, [data])
+
+                       for i in itertools.count():
+                               try:
+                                       val = tuple(cp.get('structure', '%d' % i).split())
+                                       if val == ('flat',):
+                                               pass
+                                       elif val[0] == 'filename-hash' and len(val) == 3:
+                                               if val[1] not in get_valid_checksum_keys():
+                                                       continue
+                                               # validate cutoffs
+                                               cutoffs_good = False
+                                               for c in val[2].split(':'):
+                                                       try:
+                                                               c = int(c)
+                                                       except ValueError:
+                                                               break
+                                                       else:
+                                                               if c % 4 != 0:
+                                                                       break
+                                               else:
+                                                       cutoffs_good = True
+                                               if not cutoffs_good:
+                                                       continue
+                                       else:
+                                               # (skip unsupported variant)
+                                               continue
+                                       layout = val
+                                       break
+                               except NoOptionError:
+                                       break
+               except IOError:
+                       pass
+
+               cache[mirror_url] = (time.time(), layout)
+               with open(cache_file, 'w') as f:
+                       json.dump(cache, f)
+
+       if layout[0] == 'flat':
+               return mirror_url + "/distfiles/" + filename
+       elif layout[0] == 'filename-hash':
+               return (mirror_url + "/distfiles/" +
+                               filename_hash_path(filename, *layout[1:]) + filename)
+       else:
+               raise AssertionError("get_mirror_url() got unknown layout type")

raise AssertionError("get_mirror_url() got unknown layout type %s wanted one of %s" % (layout[0], ('flat', 'filename-hash')))

E.g. if you got an unknown thing, it's nice to print what you got and what you wanted so callers can fix it.
 
+
+
 def fetch(myuris, mysettings, listonly=0, fetchonly=0,
        locks_in_subdir=".locks", use_locks=1, try_mirrors=1, digests=None,
        allow_missing_digests=True):
@@ -434,8 +538,9 @@ def fetch(myuris, mysettings, listonly=0, fetchonly=0,
        for myfile, myuri in file_uri_tuples:
                if myfile not in filedict:
                        filedict[myfile]=[]
-                       for y in range(0,len(locations)):
-                               filedict[myfile].append(locations[y]+"/distfiles/"+myfile)
+                       for l in locations:
+                               filedict[myfile].append(get_mirror_url(l, myfile,
+                                               mysettings["EROOT"]))
                if myuri is None:
                        continue
                if myuri[:9]=="mirror://":
--
2.23.0


Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] fetch: Support GLEP 75 mirror structure

Michał Górny-5
On Thu, 2019-10-03 at 21:58 -0700, Alec Warner wrote:

> On Thu, Oct 3, 2019 at 7:52 AM Michał Górny <[hidden email]> wrote:
>
> > Add support for the subset of GLEP 75 needed by Gentoo Infra.  This
> > includes fetching and parsing layout.conf, and support for flat layout
> > and filename-hash layout with cutoffs being multiples of 4.
> >
> > Bug: https://bugs.gentoo.org/646898
> > Signed-off-by: Michał Górny <[hidden email]>
> > ---
> >  lib/portage/package/ebuild/fetch.py | 113 +++++++++++++++++++++++++++-
> >  1 file changed, 109 insertions(+), 4 deletions(-)
> >
> > diff --git a/lib/portage/package/ebuild/fetch.py
> > b/lib/portage/package/ebuild/fetch.py
> > index 227bf45ae..692efcc01 100644
> > --- a/lib/portage/package/ebuild/fetch.py
> > +++ b/lib/portage/package/ebuild/fetch.py
> > @@ -7,12 +7,15 @@ __all__ = ['fetch']
> >
> >  import errno
> >  import io
> > +import itertools
> > +import json
> >  import logging
> >  import random
> >  import re
> >  import stat
> >  import sys
> >  import tempfile
> > +import time
> >
> >  from collections import OrderedDict
> >
> > @@ -27,14 +30,17 @@ portage.proxy.lazyimport.lazyimport(globals(),
> >         'portage.package.ebuild.doebuild:doebuild_environment,' + \
> >                 '_doebuild_spawn',
> >         'portage.package.ebuild.prepare_build_dirs:prepare_build_dirs',
> > +
> >  'portage.util.configparser:SafeConfigParser,read_configs,NoOptionError',
> > +       'portage.util._urlopen:urlopen',
> >  )
> >
> >  from portage import os, selinux, shutil, _encodings, \
> >         _movefile, _shell_quote, _unicode_encode
> >  from portage.checksum import (get_valid_checksum_keys, perform_md5,
> > verify_all,
> > -       _filter_unaccelarated_hashes, _hash_filter, _apply_hash_filter)
> > +       _filter_unaccelarated_hashes, _hash_filter, _apply_hash_filter,
> > +       checksum_str)
> >  from portage.const import BASH_BINARY, CUSTOM_MIRRORS_FILE, \
> > -       GLOBAL_CONFIG_PATH
> > +       GLOBAL_CONFIG_PATH, CACHE_PATH
> >  from portage.data import portage_gid, portage_uid, secpass,
> > userpriv_groups
> >  from portage.exception import FileNotFound, OperationNotPermitted, \
> >         PortageException, TryAgain
> > @@ -253,6 +259,104 @@ _size_suffix_map = {
> >         'Y' : 80,
> >  }
> >
> > +
> > +def filename_hash_path(filename, algo, cutoffs):
> > +       """
> > +       Get directory path for filename in filename-hash mirror structure.
> > +
> > +       @param filename: Filename to fetch
> > +       @param algo: Hash algorithm
> > +       @param cutoffs: Cutoff values (n:n...)
> > +       @return: Directory path
> > +       """
> > +
> > +       fnhash = checksum_str(filename.encode('utf8'), algo)
> > +       ret = ''
> > +       for c in cutoffs.split(':'):
> > +               c = int(c) // 4
> > +               ret += fnhash[:c] + '/'
> >
>
> When making a path, please use os.path.join()
This is a URL, not a path.

>
>
> > +               fnhash = fnhash[c:]
> > +       return ret
> > +
> > +
> > +def get_mirror_url(mirror_url, filename, eroot):
> > +       """
> > +       Get correct fetch URL for a given file, accounting for mirror
> > +       layout configuration.
> > +
> > +       @param mirror_url: Base URL to the mirror (without '/distfiles')
> > +       @param filename: Filename to fetch
> > +       @param eroot: EROOT to use for the cache file
> > +       @return: Full URL to fetch
> > +       """
> >
> +
> > +       cache_file = os.path.join(eroot, CACHE_PATH,
> > 'mirror-metadata.json')
> > +       try:
> > +               with open(cache_file, 'r') as f:
> > +                       cache = json.load(f)
> > +       except (IOError, ValueError):
> > +               cache = {}
> >
>
> I'm a bit worried that we are opening this cache file off of disk every
> time we call get_mirror_url(). Can we just cache the contents in memory
> between calls; or even better pass the cache in as argument rather than it
> be contained in get_mirror_url?
We could but this is no bottleneck.  That's premature optimization,
the way I see it.

>
>
> > +
> > +       ts, layout = cache.get(mirror_url, (0, None))
> > +       # refresh at least daily
> > +       if ts < time.time() - 86400:
> > +               # the default
> > +               layout = ('flat',)
> > +
> > +               try:
> > +                       f = urlopen(mirror_url + '/distfiles/layout.conf')
> > +                       try:
> > +                               data = io.StringIO(f.read().decode('utf8'))
> > +                       finally:
> > +                               f.close()
> > +                       cp = SafeConfigParser()
> > +                       read_configs(cp, [data])
> > +
> > +                       for i in itertools.count():
> > +                               try:
> > +                                       val = tuple(cp.get('structure',
> > '%d' % i).split())
> > +                                       if val == ('flat',):
> > +                                               pass
> > +                                       elif val[0] == 'filename-hash' and
> > len(val) == 3:
> > +                                               if val[1] not in
> > get_valid_checksum_keys():
> > +                                                       continue
> > +                                               # validate cutoffs
> > +                                               cutoffs_good = False
> > +                                               for c in val[2].split(':'):
> > +                                                       try:
> > +                                                               c = int(c)
> > +                                                       except ValueError:
> > +                                                               break
> > +                                                       else:
> > +                                                               if c % 4
> > != 0:
> > +
> >  break
> > +                                               else:
> > +                                                       cutoffs_good = True
> > +                                               if not cutoffs_good:
> > +                                                       continue
> > +                                       else:
> > +                                               # (skip unsupported
> > variant)
> > +                                               continue
> > +                                       layout = val
> > +                                       break
> > +                               except NoOptionError:
> > +                                       break
> > +               except IOError:
> > +                       pass
> > +
> > +               cache[mirror_url] = (time.time(), layout)
> > +               with open(cache_file, 'w') as f:
> > +                       json.dump(cache, f)
> > +
> > +       if layout[0] == 'flat':
> > +               return mirror_url + "/distfiles/" + filename
> > +       elif layout[0] == 'filename-hash':
> > +               return (mirror_url + "/distfiles/" +
> > +                               filename_hash_path(filename, *layout[1:])
> > + filename)
> > +       else:
> > +               raise AssertionError("get_mirror_url() got unknown layout
> > type")
> >
>
> raise AssertionError("get_mirror_url() got unknown layout type %s wanted
> one of %s" % (layout[0], ('flat', 'filename-hash')))
>
> E.g. if you got an unknown thing, it's nice to print what you got and what
> you wanted so callers can fix it.
Assertions are not for callers.  They merely check that one half of
the functions wasn't updated while the other half was left behind.

>
>
> > +
> > +
> >  def fetch(myuris, mysettings, listonly=0, fetchonly=0,
> >         locks_in_subdir=".locks", use_locks=1, try_mirrors=1, digests=None,
> >         allow_missing_digests=True):
> > @@ -434,8 +538,9 @@ def fetch(myuris, mysettings, listonly=0, fetchonly=0,
> >         for myfile, myuri in file_uri_tuples:
> >                 if myfile not in filedict:
> >                         filedict[myfile]=[]
> > -                       for y in range(0,len(locations)):
> > -
> >  filedict[myfile].append(locations[y]+"/distfiles/"+myfile)
> > +                       for l in locations:
> > +                               filedict[myfile].append(get_mirror_url(l,
> > myfile,
> > +                                               mysettings["EROOT"]))
> >                 if myuri is None:
> >                         continue
> >                 if myuri[:9]=="mirror://":
> > --
> > 2.23.0
> >
> >
> >
--
Best regards,
Michał Górny


signature.asc (631 bytes) Download Attachment